# install.packages("devtools")
library(devtools)
## Warning: package 'devtools' was built under R version 3.4.1
# source("http://bioconductor.org/biocLite.R")
# biocLite("qvalue")
# install_github("whitlock/OutFLANK")
library(OutFLANK)
## Loading required package: qvalue
if(!("adegenet" %in% installed.packages())){install.packages("adegenet")}
library(vcfR)
##
## ***** *** vcfR *** *****
## This is vcfR 1.5.0
## browseVignettes('vcfR') # Documentation
## citation('vcfR') # Citation
## ***** ***** ***** *****
library(adegenet)
## Loading required package: ade4
## Warning: package 'ade4' was built under R version 3.4.1
##
## /// adegenet 2.0.1 is loaded ////////////
##
## > overview: '?adegenet'
## > tutorials/doc/questions: 'adegenetWeb()'
## > bug reports/feature requests: adegenetIssues()
library(pcadapt)
library(lfmm)
library(LEA)
## devtools::install_github("privefl/bigsnpr")
library(bigsnpr)
## Loading required package: bigstatsr
library(bigstatsr)
library(vcfR)
library(RColorBrewer)
library(ggplot2)
library(fields)
## Loading required package: spam
## Loading required package: grid
## Spam version 1.4-0 (2016-08-29) is loaded.
## Type 'help( Spam)' or 'demo( spam)' for a short introduction
## and overview of this package.
## Help for individual functions is also obtained by adding the
## suffix '.spam' to the function name, e.g. 'help( chol.spam)'.
##
## Attaching package: 'spam'
## The following objects are masked from 'package:base':
##
## backsolve, forwardsolve
## Loading required package: maps
vcf <- read.vcfR("evals/1236_RecomLowReg_VCFallsim1.vcf.gz")
##
Meta line 13 read in.
## All meta lines processed.
## Character matrix gt created.
## Character matrix gt rows: 12736
## Character matrix gt cols: 1009
## skip: 0
## nrows: 12736
## row_num: 0
##
##
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant 7000
Processed variant 8000
Processed variant 9000
Processed variant 10000
Processed variant 11000
Processed variant 12000
Processed variant: 12736
## All variants processed
ind0 <- read.table("evals/1236_RecomLowReg_outputIndAll.txt", header=TRUE)
muts <- read.table("evals/1236_RecomLowReg_outputMuts.txt", header=TRUE)
phen_env <- read.table("evals/1236_RecomLowReg_outputPhenEnv.txt", header=TRUE)
sim <- system("grep recomb evals/1236_RecomLowReg_outputSim.txt", TRUE)
muts$pa2 <- round(muts$selCoef^2*muts$freq*(1-muts$freq),3)
muts$prop=NA
muts$prop[muts$type=="m2"] <- muts$pa2[muts$type=="m2"]/sum(muts$pa2[muts$type=="m2"])
which(duplicated(muts$position))
## integer(0)
muts$count <- FALSE
muts$count[muts$prop>=0.01] <- TRUE
muts$count[muts$type=="m4" & muts$freq > 0.05] <- TRUE
muts
## position selCoef originGen type freq pa2 prop count
## 1 55166 -0.1407800 4 m2 1.0000 0.000 0.000000000 FALSE
## 2 59377 0.2438220 5994 m2 0.0010 0.000 0.000000000 FALSE
## 3 60924 -0.0487880 5392 m2 0.1425 0.000 0.000000000 FALSE
## 4 61367 0.6148460 5993 m2 0.0025 0.001 0.008928571 FALSE
## 5 63119 0.1743930 5996 m2 0.0030 0.000 0.000000000 FALSE
## 6 63357 0.0402488 5766 m2 0.0960 0.000 0.000000000 FALSE
## 7 64835 0.5254280 5997 m2 0.0005 0.000 0.000000000 FALSE
## 8 66264 -0.2040060 117 m2 1.0000 0.000 0.000000000 FALSE
## 9 74783 -0.2814420 5735 m2 0.1450 0.010 0.089285714 TRUE
## 10 76476 0.0385372 5822 m2 0.0305 0.000 0.000000000 FALSE
## 11 77162 -0.4895630 6000 m2 0.0005 0.000 0.000000000 FALSE
## 12 84721 0.2722690 3963 m2 0.2530 0.014 0.125000000 TRUE
## 13 85470 0.0705718 5995 m2 0.0035 0.000 0.000000000 FALSE
## 14 89074 0.2579440 5946 m2 0.0340 0.002 0.017857143 TRUE
## 15 89608 -0.5494260 5962 m2 0.0010 0.000 0.000000000 FALSE
## 16 92810 -1.0555100 5998 m2 0.0005 0.001 0.008928571 FALSE
## 17 95064 -0.4700100 3980 m2 0.7510 0.041 0.366071429 TRUE
## 18 95284 0.3243560 4862 m2 0.3510 0.024 0.214285714 TRUE
## 19 98753 -0.4121090 5998 m2 0.0005 0.000 0.000000000 FALSE
## 20 98973 0.1662070 417 m2 1.0000 0.000 0.000000000 FALSE
## 21 102738 0.3350400 6000 m2 0.0005 0.000 0.000000000 FALSE
## 22 106199 -0.4088850 5972 m2 0.0010 0.000 0.000000000 FALSE
## 23 110590 0.1460140 4546 m2 0.7540 0.004 0.035714286 TRUE
## 24 112536 -0.2875760 5947 m2 0.0090 0.001 0.008928571 FALSE
## 25 119926 -0.0105646 2471 m2 1.0000 0.000 0.000000000 FALSE
## 26 120860 -0.5917910 5961 m2 0.0035 0.001 0.008928571 FALSE
## 27 128050 -0.0431013 5339 m2 0.2650 0.000 0.000000000 FALSE
## 28 131674 -0.1409810 3704 m2 1.0000 0.000 0.000000000 FALSE
## 29 134613 -0.3318980 5998 m2 0.0010 0.000 0.000000000 FALSE
## 30 136416 -0.3858650 5705 m2 0.0145 0.002 0.017857143 TRUE
## 31 136835 0.2263680 4986 m2 0.2425 0.009 0.080357143 TRUE
## 32 140441 1.6589400 6000 m2 0.0005 0.001 0.008928571 FALSE
## 33 140576 -0.0571763 5888 m2 0.0120 0.000 0.000000000 FALSE
## 34 141982 0.3093680 1308 m2 1.0000 0.000 0.000000000 FALSE
## 35 143198 -0.2626850 5971 m2 0.0010 0.000 0.000000000 FALSE
## 36 149824 0.1493040 5975 m2 0.0245 0.001 0.008928571 FALSE
## 37 175000 0.8000000 5780 m4 0.9955 0.003 NA TRUE
### Color recombination regions ####
lgs <- seq(50000, 500000, by=50000) # linkage groups recombination breakpoints 0.5
lg_whereplot <- lgs - 25000
(recom_rates <- as.numeric(unlist(strsplit(sim[1], " "))[-1]))
## [1] 1.00000e-05 5.00000e-01 1.00000e-05 5.00000e-01 1.00000e-05
## [6] 5.00000e-01 1.00000e-05 5.00000e-01 1.00000e-05 5.00000e-01
## [11] 1.00000e-05 1.00000e-11 1.00000e-05 5.00000e-01 1.00000e-05
## [16] 1.00000e-11 1.00000e-05 5.00000e-01 1.00000e-05 1.00000e-08
## [21] 1.00000e-05 5.00000e-01 6.26869e-07 4.94044e-11 1.13234e-04
## [26] 1.55358e-04 9.89536e-07 2.12766e-05 2.15634e-05 5.96114e-07
## [31] 2.25052e-07 1.39301e-02 3.21115e-08 2.24422e-05 1.13803e-04
## [36] 4.19961e-09 1.23114e-04 4.62895e-06 3.95739e-07 3.42904e-06
## [41] 5.54896e-06 1.08548e-04 1.43782e-07 2.86846e-06 9.90242e-05
## [46] 4.78288e-05 7.57642e-07 4.76437e-04 5.70054e-08 1.45567e-06
## [51] 4.50553e-07 5.48482e-05 5.53232e-05 8.33817e-06 7.86841e-09
## [56] 3.62774e-04 1.81594e-06 6.62984e-06 2.14141e-07 5.42497e-10
## [61] 4.73403e-07 8.57399e-06 7.53417e-05 2.12371e-03 6.68129e-05
## [66] 1.39354e-07 1.18159e-04 1.50988e-03 5.95875e-04 5.94761e-05
## [71] 1.06801e-05 1.00944e-06 1.33482e-02 4.01259e-07 1.20515e-01
## [76] 9.65473e-07 7.61952e-06 6.39211e-09 8.21413e-05 7.27847e-06
## [81] 9.81278e-06 1.32774e-05 4.77499e-07 1.60898e-08 1.48064e-02
## [86] 3.33800e-05 8.34835e-01 1.21952e-02 5.02130e-09 2.52336e-03
## [91] 1.45148e-03 1.90139e-08 1.67139e-05 4.44781e-06 8.17499e-06
## [96] 2.96218e-01 3.34819e-05 1.16501e-05 1.58825e-07 1.52615e-04
## [101] 1.87822e-03 6.48994e-07 1.69560e-06 2.60184e-04 6.75128e-06
## [106] 2.84624e-03 2.39607e-05 1.00247e-07 3.36079e-04 1.20344e-05
## [111] 1.44094e-05 1.45813e-05 2.67124e-05 4.05815e-07 2.03668e-07
## [116] 2.45710e-04 2.67158e-10 9.35007e-06 8.48453e-06 5.45712e-03
## [121] 3.49522e-04 1.20048e-03 5.00000e-01 1.00000e-05
(recom_end <- as.integer(unlist(strsplit(sim[2], " "))[-1]))
## [1] 50000 50001 100000 100001 150000 150001 200000 200001 250000 250001
## [11] 270000 280000 300000 300001 320000 330000 350000 350001 370000 380000
## [21] 400000 400001 400213 401694 401739 402182 402246 402299 402663 403305
## [31] 403993 404452 404470 404979 405098 405986 406965 408000 408035 408725
## [41] 408942 409115 409217 410429 410492 410949 411100 411103 411586 411850
## [51] 412326 412665 414529 414925 415608 416082 416797 418443 418642 418719
## [61] 419196 419421 419675 421406 421537 421851 422023 422804 423876 424257
## [71] 425048 425130 425469 425699 425836 426605 427114 427185 427417 427518
## [81] 427964 429578 430455 431802 432200 432753 432833 432853 433121 433122
## [91] 433301 433349 433357 433454 433474 433800 433919 434218 434751 435511
## [101] 435782 436138 437216 437892 438289 438651 440807 442869 443146 444030
## [111] 444245 444626 445080 445780 445898 447105 447404 447950 448210 448693
## [121] 449939 450000 450001 500000
recom <- data.frame(recom_rates, recom_end)
recom$logrates <- log10(recom_rates)
# plot r=0.5 as black
# plot r=1e-11 as white
(brks <- with(recom, seq(min(logrates), max(logrates), length.out = 10)))
## [1] -11.00000000 -9.78648882 -8.57297763 -7.35946645 -6.14595527
## [6] -4.93244408 -3.71893290 -2.50542172 -1.29191053 -0.07839935
grps <- with(recom, cut(logrates, breaks = brks, include.lowest = TRUE))
nlevels(grps)
## [1] 9
colfunc <- paste(colorRampPalette(colors=c("lightblue", "white", "grey90"))(9), 70, sep="")
recom$col <- colfunc[grps]
plot(recom$logrates, col=recom$col)
### Replace chromsome 1 with actual chromosome positions ####
ends=c(0,lgs)
dim(vcf@gt)
## [1] 12736 1001
vcf@fix[,"CHROM"] <- NA
POS <- as.numeric(vcf@fix[,"POS"])
for (i in 1:(length(ends)-1)){
cond <- POS >= ends[i] & POS < ends[i+1]
print(c(ends[i], ends[i+1], sum(cond)))
vcf@fix[cond,"CHROM"] = i
}
## [1] 0 50000 1303
## [1] 50000 100000 1338
## [1] 100000 150000 1272
## [1] 150000 200000 1266
## [1] 200000 250000 1327
## [1] 250000 300000 1156
## [1] 300000 350000 1290
## [1] 350000 400000 1250
## [1] 400000 450000 1266
## [1] 450000 500000 1268
table(vcf@fix[,"CHROM"])
##
## 1 10 2 3 4 5 6 7 8 9
## 1303 1268 1338 1272 1266 1327 1156 1290 1250 1266
my_ord <- order(as.numeric(vcf@fix[,"POS"]))
vcf2 <- vcf
vcf2 <- vcf[my_ord,]
head(vcf2)
## [1] "***** Object of class 'vcfR' *****"
## [1] "***** Meta section *****"
## [1] "##fileformat=VCFv4.2"
## [1] "##fileDate=20170924"
## [1] "##source=SLiM"
## [1] "##INFO=<ID=MID,Number=1,Type=Integer,Description=\"Mutation ID in SLiM\">"
## [1] "##INFO=<ID=S,Number=1,Type=Float,Description=\"Selection Coefficient\">"
## [1] "##INFO=<ID=DOM,Number=1,Type=Float,Description=\"Dominance\">"
## [1] "First 6 rows."
## [1]
## [1] "***** Fixed section *****"
## CHROM POS ID REF ALT QUAL FILTER
## [1,] "1" "43" NA "A" "T" "1000" "PASS"
## [2,] "1" "104" NA "A" "T" "1000" "PASS"
## [3,] "1" "160" NA "A" "T" "1000" "PASS"
## [4,] "1" "204" NA "A" "T" "1000" "PASS"
## [5,] "1" "214" NA "A" "T" "1000" "PASS"
## [6,] "1" "275" NA "A" "T" "1000" "PASS"
## [1]
## [1] "***** Genotype section *****"
## FORMAT i0 i1 i2 i3 i4
## [1,] "GT" "0|0" "0|0" "0|0" "0|0" "0|0"
## [2,] "GT" "0|0" "0|0" "0|0" "0|0" "0|0"
## [3,] "GT" "0|0" "0|0" "0|0" "0|0" "0|0"
## [4,] "GT" "0|0" "0|0" "0|0" "0|0" "0|0"
## [5,] "GT" "0|0" "0|0" "0|0" "0|0" "0|0"
## [6,] "GT" "0|0" "0|0" "0|0" "0|0" "0|0"
## [1] "First 6 columns only."
## [1]
## [1] "Unique GT formats:"
## [1] "GT"
## [1]
head(vcf)
## [1] "***** Object of class 'vcfR' *****"
## [1] "***** Meta section *****"
## [1] "##fileformat=VCFv4.2"
## [1] "##fileDate=20170924"
## [1] "##source=SLiM"
## [1] "##INFO=<ID=MID,Number=1,Type=Integer,Description=\"Mutation ID in SLiM\">"
## [1] "##INFO=<ID=S,Number=1,Type=Float,Description=\"Selection Coefficient\">"
## [1] "##INFO=<ID=DOM,Number=1,Type=Float,Description=\"Dominance\">"
## [1] "First 6 rows."
## [1]
## [1] "***** Fixed section *****"
## CHROM POS ID REF ALT QUAL FILTER
## [1,] "2" "55167" NA "A" "T" "1000" "PASS"
## [2,] "7" "327919" NA "A" "T" "1000" "PASS"
## [3,] "7" "328998" NA "A" "T" "1000" "PASS"
## [4,] "5" "243497" NA "A" "T" "1000" "PASS"
## [5,] "4" "153461" NA "A" "T" "1000" "PASS"
## [6,] "4" "183906" NA "A" "T" "1000" "PASS"
## [1]
## [1] "***** Genotype section *****"
## FORMAT i0 i1 i2 i3 i4
## [1,] "GT" "1|1" "1|1" "1|1" "1|1" "1|1"
## [2,] "GT" "0|0" "1|1" "0|0" "1|0" "0|0"
## [3,] "GT" "0|0" "1|1" "0|0" "1|0" "0|0"
## [4,] "GT" "1|0" "0|1" "1|1" "0|0" "1|1"
## [5,] "GT" "0|0" "0|0" "0|1" "0|1" "0|0"
## [6,] "GT" "0|0" "1|0" "0|0" "0|0" "0|0"
## [1] "First 6 columns only."
## [1]
## [1] "Unique GT formats:"
## [1] "GT"
## [1]
Plotting functions
plot_r_legend <- function(){
### Plot recombination legend
xl <- 1
yb <- 1
xr <- 1.5
yt <- 2
ncol = 9
par(mar=c(5.1,2.5,3.1,0.5))
plot(NA,type="n",ann=FALSE,xlim=c(1,2),ylim=c(1,2),xaxt="n",yaxt="n",bty="n")
mtext("r", side=3, adj=0.2, cex=2)
rect(
xl,
head(seq(yb,yt,(yt-yb)/ncol),-1),
xr,
tail(seq(yb,yt,(yt-yb)/ncol),-1),
col=colfunc
)
mtext(c(10^(round(brks)[1:9]),0.5),side=2,at=seq(yb,yt,(yt-yb)/(ncol)),las=2,cex=0.7)
}
### Plot function
recom_end2 = c(0, recom_end)
plot_layers <- function(y_head=0, y_arrows=c(1,0.25), ...){
### Plot recombination variation
for (i in 1:nrow(recom))
{
polygon(x = c(recom_end2[i], recom$recom_end[i], recom$recom_end[i], recom_end2[i]),
y = c(-1000, -1000, 1000, 1000),
col=as.character(recom$col[i]), border = NA)
}
abline(v=lgs)
text(lg_whereplot, y = y_head,
labels = c("LG1\nNeut", "LG2\nQTL", "LG3\nQTL",
"LG4\nSS", "LG5\nBS",
"LG6\nlow r+\nBS", "LG7\nlow r",
"LG8\nmed r", "LG9\nr var", "LG10\nNeut"))
### Add QTLs and Sweep Location
arrows(muts$position[muts$count], y_arrows[1], muts$position[muts$count], y_arrows[2], col="orange", lwd=muts$prop[muts$count]*20, length = 0.1)
arrows(muts$position[muts$type=="m4"], y_arrows[1], muts$position[muts$type=="m4"], y_arrows[2], col="purple", lwd=2, length = 0.1)
} #end plot function
layout(matrix(1:2,nrow=1),widths=c(0.8,0.2))
par(mar=c(5.1,3.1,3.1,1.9))
plot(0,0, col="white", xlim=c(0, 500000), ylim=c(-1,1), xaxs="i", yaxt="n", ylab="", xlab="Position (bp)")
plot_layers()
plot_r_legend()
# NB: Creates a vcfR object (stored in RAM) which size is twice as big as the original vcf file. So when dealing with large data, make sure you have enough RAM space available before proceeding.
geno <- extract.gt(vcf2) # Character matrix containing the genotypes
position <- getPOS(vcf2) # Positions in bp
chromosome <- getCHROM(vcf2) # Chromosome information
G <- matrix(NA, nrow = nrow(geno), ncol = ncol(geno))
G[geno %in% c("0/0", "0|0")] <- 0
G[geno %in% c("0/1", "1/0", "1|0", "0|1")] <- 1
G[geno %in% c("1/1", "1|1")] <- 2
## Remove fixed loci or all heterozygotes ####
dim(G)
## [1] 12736 1000
head(G[1:10,1:10])
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] 0 0 0 0 0 0 0 0 0 0
## [2,] 0 0 0 0 0 0 0 0 0 0
## [3,] 0 0 0 0 0 0 0 0 0 1
## [4,] 0 0 0 0 0 0 0 0 0 0
## [5,] 0 0 0 0 0 0 0 0 0 0
## [6,] 0 0 0 0 0 0 0 0 0 0
rem = c(which(rowSums(G)==0), which(rowSums(G-2)==0)) ## fixed loci
position[rem]
## [1] 55167 66265 98974 119927 131675 141983
training <- list(G = G[-rem,], position = position[-rem], chromosome = chromosome[-rem])
vcf_filt <- vcf2[-rem,]
dim(vcf_filt@gt)
## [1] 12730 1001
### optional assignment to pops
toclust <- ind0[,c("x","y")]
d <- dist(toclust)
hc <- hclust(d, method="ward.D")
#fit <- kmeans(toclust, 30)
#plot(ind$x, ind$y, pch=fit$cluster, col=fit$cluster+1)
k <- 39
group <- cutree(hc, k=k)
table(group)
## group
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## 33 36 27 22 27 13 26 18 19 18 27 26 24 36 34 23 15 33 46 54 24 28 16 17 32
## 26 27 28 29 30 31 32 33 34 35 36 37 38 39
## 42 10 37 8 20 22 34 33 26 23 20 20 19 12
(group_env <- sort(round(tapply(ind0$envi, group, mean), 1)))
## 37 1 25 23 36 14 20 30 28 32 18 24 26 2 27
## -1.6 -1.3 -1.3 -1.1 -1.1 -1.0 -1.0 -1.0 -0.7 -0.7 -0.5 -0.5 -0.5 -0.4 -0.4
## 12 31 4 9 10 3 34 8 38 13 21 22 5 11 15
## -0.3 -0.3 -0.2 -0.2 -0.2 -0.1 -0.1 0.0 0.0 0.1 0.3 0.3 0.4 0.5 0.5
## 33 7 35 17 39 6 16 29 19
## 0.6 0.7 0.7 0.8 0.8 0.9 0.9 1.0 1.1
(group_table <- data.frame(group=names(group_env), group_env, new_g = 1:39))
## group group_env new_g
## 37 37 -1.6 1
## 1 1 -1.3 2
## 25 25 -1.3 3
## 23 23 -1.1 4
## 36 36 -1.1 5
## 14 14 -1.0 6
## 20 20 -1.0 7
## 30 30 -1.0 8
## 28 28 -0.7 9
## 32 32 -0.7 10
## 18 18 -0.5 11
## 24 24 -0.5 12
## 26 26 -0.5 13
## 2 2 -0.4 14
## 27 27 -0.4 15
## 12 12 -0.3 16
## 31 31 -0.3 17
## 4 4 -0.2 18
## 9 9 -0.2 19
## 10 10 -0.2 20
## 3 3 -0.1 21
## 34 34 -0.1 22
## 8 8 0.0 23
## 38 38 0.0 24
## 13 13 0.1 25
## 21 21 0.3 26
## 22 22 0.3 27
## 5 5 0.4 28
## 11 11 0.5 29
## 15 15 0.5 30
## 33 33 0.6 31
## 7 7 0.7 32
## 35 35 0.7 33
## 17 17 0.8 34
## 39 39 0.8 35
## 6 6 0.9 36
## 16 16 0.9 37
## 29 29 1.0 38
## 19 19 1.1 39
ind0$group <- group
ind2 <- merge(ind0, group_table)
head(ind2)
## group id x y phenotype1 envi phenotype2 group_env
## 1 1 0 0.967790 0.156432 0.1855210 -1.23276 0.1855210 -1.3
## 2 1 766 0.989254 0.184359 0.3714770 -1.29156 0.3714770 -1.3
## 3 1 831 0.966644 0.155146 -1.3844000 -1.22898 -1.3844000 -1.3
## 4 1 813 0.997858 0.192452 -0.0842549 -1.31945 -0.0842549 -1.3
## 5 1 783 0.994767 0.190174 0.1650630 -1.30821 0.1650630 -1.3
## 6 1 650 0.962838 0.159555 -1.1741400 -1.19981 -1.1741400 -1.3
## new_g
## 1 2
## 2 2
## 3 2
## 4 2
## 5 2
## 6 2
# the merge puts individuals out of order relative to the genotype matrix
# the id should put them back into order
ind <- ind2[order(ind2$id),]
head(ind)
## group id x y phenotype1 envi phenotype2 group_env
## 1 1 0 0.967790 0.1564320 0.1855210 -1.232760 0.1855210 -1.3
## 50 2 1 0.242315 0.0480874 -0.2245730 -0.408370 -0.2245730 -0.4
## 82 3 2 0.816066 0.5568300 0.0472845 -0.114717 0.0472845 -0.1
## 104 4 3 0.770902 0.9594540 -0.8786200 -0.207345 -0.8786200 -0.2
## 119 5 4 0.380837 0.0457108 0.5888120 0.368229 0.5888120 0.4
## 158 6 5 0.320940 0.5404820 0.2488010 0.881583 0.2488010 0.9
## new_g
## 1 2
## 50 14
## 82 21
## 104 18
## 119 28
## 158 36
par(mfrow=c(1,1))
plot(ind$x, ind$y, pch=ind$group%%6, col=adjustcolor(ind$group%%3+2, alpha=0.2))
text(tapply(ind$x, ind$new_g, mean), tapply(ind$y, ind$new_g, mean), label=1:39)
#write.table(ind, "outputIndAll_pop.txt")
G0<-add_code256(big_copy(t(training$G),type="raw"),code=bigsnpr:::CODE_012)
## Warning: Assignment will down cast from double to raw.
## Hint: To remove this warning, use options(bigstatsr.typecast.warning = FALSE).
#puts it in the raw format and stores likelihood genotype probability
dim(G0)
## [1] 1000 12730
str(G)
## num [1:12736, 1:1000] 0 0 0 0 0 0 0 0 0 0 ...
head(G[,1:10])
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] 0 0 0 0 0 0 0 0 0 0
## [2,] 0 0 0 0 0 0 0 0 0 0
## [3,] 0 0 0 0 0 0 0 0 0 1
## [4,] 0 0 0 0 0 0 0 0 0 0
## [5,] 0 0 0 0 0 0 0 0 0 0
## [6,] 0 0 0 0 0 0 0 0 0 0
newpc<-snp_autoSVD(G=G0,infos.chr = training$chromosome,infos.pos = training$position)
## Phase of clumping at r2 > 0.2.. keep 7438 SNPs.
##
## Iteration 1:
## Computing SVD..
##
## Converged!
# this is doing SNP pruning - removing correlated SNPs
# take snps with highest MAF and correlate snps around it
# Snps with R^2 > 0.2 are removed
# the subset is the indexes of the remaining SNPs
str(newpc)
## List of 7
## $ d : num [1:10] 208 164 162 159 158 ...
## $ u : num [1:1000, 1:10] 0.00369 0.01023 0.00133 0.01267 -0.03907 ...
## $ v : num [1:7438, 1:10] -0.00236 -0.00826 0.01564 -0.00537 0.00347 ...
## $ niter : int 10
## $ nops : int 178
## $ center: num [1:7438] 0.002 0.016 0.067 0.001 0.003 0.105 0.014 0.002 0.006 0.005 ...
## $ scale : num [1:7438] 0.0447 0.126 0.2545 0.0316 0.0547 ...
## - attr(*, "class")= chr "big_SVD"
## - attr(*, "subset")= int [1:7438] 1 2 3 4 6 7 8 9 11 12 ...
## - attr(*, "lrldr")='data.frame': 0 obs. of 3 variables:
## ..$ Chr : int(0)
## ..$ Start: int(0)
## ..$ Stop : int(0)
plot(newpc)
which_pruned <- attr(newpc, which="subset")
length(which_pruned)
## [1] 7438
### Calculate average LD around each SNP ####
LD <- LD2<- rep(NA, ncol(G0))
# the following is not my most efficient lines of code
for(i in 26:(ncol(G0)-26)){
LD[i]=mean(cor(G0[,(i-25):(i+25)])[,25])
}
layout(matrix(1))
plot(training$position, abs(LD), pch=20, ylim=c(0,0.18), xaxs="i", col=rgb(0,0,0,0.5), xlab="Position (bp)", ylab="Average LD 50-SNP windows")
plot_layers(y_head=0.17, y_arrows=c(0.02, 0))
hist(training$position[which_pruned], breaks=seq(0,500000, by=10000), col="lightgrey")
plot_layers(y_head=20, y_arrows=c(40,25))
aux<-pcadapt(training$G,K=10)
## Number of SNPs: 12730
## Number of individuals: 1000
str(aux)
## List of 10
## $ scores : num [1:1000, 1:10] 0.0287 -0.0677 0.0303 -0.0179 0.0331 ...
## $ singular.values: num [1:10] 11.48 7.6 6.31 5.91 5.63 ...
## $ zscores : num [1:12730, 1:10] 0 0 0 0 0 ...
## $ loadings : num [1:12730, 1:10] 0 0 0 0 0 ...
## $ maf : num [1:12730] 0.001 0.008 0.0335 0.0005 0.0015 0.0015 0.0525 0.007 0.001 0.0005 ...
## $ missing : num [1:12730] 0 0 0 0 0 0 0 0 0 0 ...
## $ stat : num [1:12730(1d)] NA NA NA NA NA ...
## $ gif : num 1.21
## $ chi2.stat : num [1:12730(1d)] NA NA NA NA NA ...
## $ pvalues : num [1:12730] NA NA NA NA NA ...
## - attr(*, "class")= chr "pcadapt"
## - attr(*, "K")= num 10
## - attr(*, "data.type")= chr "genotype"
## - attr(*, "method")= chr "mahalanobis"
## - attr(*, "min.maf")= num 0.05
plot(aux,option="screeplot")
x <-pcadapt(training$G,K=4)
## Number of SNPs: 12730
## Number of individuals: 1000
summary(x)
## Length Class Mode
## scores 4000 -none- numeric
## singular.values 4 -none- numeric
## zscores 50920 -none- numeric
## loadings 50920 -none- numeric
## maf 12730 -none- numeric
## missing 12730 -none- numeric
## stat 12730 -none- numeric
## gif 1 -none- numeric
## chi2.stat 12730 -none- numeric
## pvalues 12730 -none- numeric
par(mar=c(4,4,1,1))
plot(x$scores[,1], x$scores[,2])
inlowr <- which(training$position>(325000-50) & training$position<(325000+50))
(snppos_inlowr <- training$position[inlowr])
## [1] 324954 324968 324969
haplotype <- as.factor(paste(training$G[inlowr[1],], training$G[inlowr[2],], training$G[inlowr[3],], sep="_"))
table(haplotype)
## haplotype
## 0_0_0 0_0_1 0_0_2 0_1_0 0_1_1 0_2_0 1_1_0 1_2_0
## 389 70 5 403 32 93 5 3
qplot(x$scores[,1], x$scores[,2], colour=haplotype, main="Individual scores without LD pruning") + scale_colour_manual(values = two.colors(n=8,"red", "blue", middle="grey")) + theme_bw()
#plot_layers(ylim=c(min(x$loadings[,1]), max(x$loadings[,1])), ylab="Loadings PC1")
layout(matrix(c(1,2,3,4,6,5,5,6),nrow=4),widths=c(0.8,0.2))
par(oma=c(3,3,1,0), mar=c(2,2,0,0))
## Top plot
summary(x$loadings[,1])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -9.714207 0.000000 0.000000 -0.002193 0.000000 9.714207
plot(training$position,x$loadings[,1], xaxs="i", pch=20, ylim=c(-11, 15))
plot_layers(y_head = 12, y_arrows=c(-8, -11))
## Middle plot
summary(x$loadings[,2])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -11.33192 0.00000 0.00000 0.01442 0.00000 11.62424
plot(training$position,x$loadings[,2], xaxs="i", pch=20, ylim=c(-11, 13))
plot_layers(y_head = 20, y_arrows=c(-8, -11))
## Middle plot
summary(x$loadings[,3])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -10.76386 0.00000 0.00000 -0.02859 0.00000 10.76386
plot(training$position,x$loadings[,3], xaxs="i", pch=20, ylim=c(-15, 15))
plot_layers(y_head = 20, y_arrows=c(-12, -15))
## Bottom plot
summary(x$loadings[,4])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -17.805202 0.000000 0.000000 0.007629 0.000000 17.805202
plot(training$position,x$loadings[,3], xaxs="i", pch=20, ylim=c(-13, 13))
plot_layers(y_head = 20, y_arrows=c(-10, -13))
## Right plot
plot_r_legend()
mtext("Position (bp)", outer=TRUE, side=1, line=1, adj=0.4)
mtext("Loading on PC axis", outer=TRUE, side=2, line=1, adj=0.5)
dim(training$G)
## [1] 12730 1000
aux<-pcadapt(training$G[which_pruned,],K=30)
## Number of SNPs: 7438
## Number of individuals: 1000
str(aux)
## List of 10
## $ scores : num [1:1000, 1:30] -0.000653 0.015434 0.00039 0.012151 -0.051125 ...
## $ singular.values: num [1:30] 4.81 3.77 3.73 3.61 3.58 ...
## $ zscores : num [1:7438, 1:30] 0 0 0 0 0 ...
## $ loadings : num [1:7438, 1:30] 0 0 0 0 0 ...
## $ maf : num [1:7438] 0.001 0.008 0.0335 0.0005 0.0015 0.0525 0.007 0.001 0.003 0.0025 ...
## $ missing : num [1:7438] 0 0 0 0 0 0 0 0 0 0 ...
## $ stat : num [1:7438(1d)] NA NA NA NA NA ...
## $ gif : num 1.11
## $ chi2.stat : num [1:7438(1d)] NA NA NA NA NA ...
## $ pvalues : num [1:7438] NA NA NA NA NA ...
## - attr(*, "class")= chr "pcadapt"
## - attr(*, "K")= num 30
## - attr(*, "data.type")= chr "genotype"
## - attr(*, "method")= chr "mahalanobis"
## - attr(*, "min.maf")= num 0.05
plot(aux,option="screeplot")
x_LD <-pcadapt(training$G[which_pruned,],K=3)
## Number of SNPs: 7438
## Number of individuals: 1000
summary(x_LD)
## Length Class Mode
## scores 3000 -none- numeric
## singular.values 3 -none- numeric
## zscores 22314 -none- numeric
## loadings 22314 -none- numeric
## maf 7438 -none- numeric
## missing 7438 -none- numeric
## stat 7438 -none- numeric
## gif 1 -none- numeric
## chi2.stat 7438 -none- numeric
## pvalues 7438 -none- numeric
par(mar=c(4,4,1,1))
qplot(x_LD$scores[,1], x_LD$scores[,2], colour=ind$envi, main="Individual scores with LD pruning", xlab="PC1 scores", ylab="PC2 scores" ) + theme_bw() + labs(colour="Envi") + scale_color_gradient2( low="blue", mid="white",
high="red", space ="Lab" )
#plot_layers(ylim=c(min(x$loadings[,1]), max(x$loadings[,1])), ylab="Loadings PC1")
layout(matrix(c(1,2,3,3),nrow=2),widths=c(0.8,0.2))
par(oma=c(3,3,1,0), mar=c(2,2,0,0))
## Top plot
summary(x_LD$loadings[,1])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -9.91659 0.00000 0.00000 -0.02043 0.00000 11.68196
plot(training$position[which_pruned],x_LD$loadings[,1], xaxs="i", pch=20, ylim=c(-11, 15))
plot_layers(y_head = 12, y_arrows=c(-5, -11))
## Middle plot
summary(x_LD$loadings[,2])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -6.011610 0.000000 0.000000 0.008229 0.000000 8.099326
plot(training$position[which_pruned],x_LD$loadings[,2], xaxs="i", pch=20, ylim=c(-11, 13))
plot_layers(y_head = 20, y_arrows=c(-8, -11))
## Right plot
plot_r_legend()
mtext("Position (bp)", outer=TRUE, side=1, line=1, adj=0.4)
mtext("Loading on PC axis", outer=TRUE, side=2, line=1, adj=0.5)
write.geno(t(training$G), "genotypes.geno")
## [1] "genotypes.geno"
project = snmf("genotypes.geno",
K = 1:4,
entropy = TRUE,
repetitions = 3,
project = "new")
## The project is saved into :
## genotypes.snmfProject
##
## To load the project, use:
## project = load.snmfProject("genotypes.snmfProject")
##
## To remove the project, use:
## remove.snmfProject("genotypes.snmfProject")
##
## [1] 1914330943
## [1] "*************************************"
## [1] "* create.dataset *"
## [1] "*************************************"
## summary of the options:
##
## -n (number of individuals) 1000
## -L (number of loci) 12730
## -s (seed random init) 1914330943
## -r (percentage of masked data) 0.05
## -x (genotype file in .geno format) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.geno
## -o (output file in .geno format) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno
##
## Write genotype file with masked data, /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno: OK.
##
## [1] "*************************************"
## [1] "* sNMF K = 1 repetition 1 *"
## [1] "*************************************"
## summary of the options:
##
## -n (number of individuals) 1000
## -L (number of loci) 12730
## -K (number of ancestral pops) 1
## -x (input file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno
## -q (individual admixture file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K1/run1/genotypes_r1.1.Q
## -g (ancestral frequencies file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K1/run1/genotypes_r1.1.G
## -i (number max of iterations) 200
## -a (regularization parameter) 10
## -s (seed random init) 140460229812031
## -e (tolerance error) 1E-05
## -p (number of processes) 1
## - diploid
##
## Read genotype file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno: OK.
##
##
## Main algorithm:
##
## Least-square error: 2184733.310493
## Write individual ancestry coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K1/run1/genotypes_r1.1.Q: OK.
## Write ancestral allele frequency coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K1/run1/genotypes_r1.1.G: OK.
##
## [1] "*************************************"
## [1] "* cross-entropy estimation *"
## [1] "*************************************"
## summary of the options:
##
## -n (number of individuals) 1000
## -L (number of loci) 12730
## -K (number of ancestral pops) 1
## -x (genotype file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.geno
## -q (individual admixture) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K1/run1/genotypes_r1.1.Q
## -g (ancestral frequencies) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K1/run1/genotypes_r1.1.G
## -i (with masked genotypes) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno
## - diploid
##
## Cross-Entropy (all data): 0.297424
## Cross-Entropy (masked data): 0.299513
## The project is saved into :
## genotypes.snmfProject
##
## To load the project, use:
## project = load.snmfProject("genotypes.snmfProject")
##
## To remove the project, use:
## remove.snmfProject("genotypes.snmfProject")
##
## [1] "*************************************"
## [1] "* sNMF K = 2 repetition 1 *"
## [1] "*************************************"
## summary of the options:
##
## -n (number of individuals) 1000
## -L (number of loci) 12730
## -K (number of ancestral pops) 2
## -x (input file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno
## -q (individual admixture file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K2/run1/genotypes_r1.2.Q
## -g (ancestral frequencies file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K2/run1/genotypes_r1.2.G
## -i (number max of iterations) 200
## -a (regularization parameter) 10
## -s (seed random init) 4595222878120206143
## -e (tolerance error) 1E-05
## -p (number of processes) 1
## - diploid
##
## Read genotype file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno: OK.
##
##
## Main algorithm:
## [ ]
## [======]
## Number of iterations: 16
##
## Least-square error: 2133085.167554
## Write individual ancestry coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K2/run1/genotypes_r1.2.Q: OK.
## Write ancestral allele frequency coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K2/run1/genotypes_r1.2.G: OK.
##
## [1] "*************************************"
## [1] "* cross-entropy estimation *"
## [1] "*************************************"
## summary of the options:
##
## -n (number of individuals) 1000
## -L (number of loci) 12730
## -K (number of ancestral pops) 2
## -x (genotype file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.geno
## -q (individual admixture) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K2/run1/genotypes_r1.2.Q
## -g (ancestral frequencies) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K2/run1/genotypes_r1.2.G
## -i (with masked genotypes) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno
## - diploid
##
## Cross-Entropy (all data): 0.290459
## Cross-Entropy (masked data): 0.294316
## The project is saved into :
## genotypes.snmfProject
##
## To load the project, use:
## project = load.snmfProject("genotypes.snmfProject")
##
## To remove the project, use:
## remove.snmfProject("genotypes.snmfProject")
##
## [1] "*************************************"
## [1] "* sNMF K = 3 repetition 1 *"
## [1] "*************************************"
## summary of the options:
##
## -n (number of individuals) 1000
## -L (number of loci) 12730
## -K (number of ancestral pops) 3
## -x (input file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno
## -q (individual admixture file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K3/run1/genotypes_r1.3.Q
## -g (ancestral frequencies file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K3/run1/genotypes_r1.3.G
## -i (number max of iterations) 200
## -a (regularization parameter) 10
## -s (seed random init) 140460229812031
## -e (tolerance error) 1E-05
## -p (number of processes) 1
## - diploid
##
## Read genotype file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno: OK.
##
##
## Main algorithm:
## [ ]
## [===============]
## Number of iterations: 39
##
## Least-square error: 2115270.930074
## Write individual ancestry coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K3/run1/genotypes_r1.3.Q: OK.
## Write ancestral allele frequency coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K3/run1/genotypes_r1.3.G: OK.
##
## [1] "*************************************"
## [1] "* cross-entropy estimation *"
## [1] "*************************************"
## summary of the options:
##
## -n (number of individuals) 1000
## -L (number of loci) 12730
## -K (number of ancestral pops) 3
## -x (genotype file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.geno
## -q (individual admixture) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K3/run1/genotypes_r1.3.Q
## -g (ancestral frequencies) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K3/run1/genotypes_r1.3.G
## -i (with masked genotypes) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno
## - diploid
##
## Cross-Entropy (all data): 0.28694
## Cross-Entropy (masked data): 0.292057
## The project is saved into :
## genotypes.snmfProject
##
## To load the project, use:
## project = load.snmfProject("genotypes.snmfProject")
##
## To remove the project, use:
## remove.snmfProject("genotypes.snmfProject")
##
## [1] "*************************************"
## [1] "* sNMF K = 4 repetition 1 *"
## [1] "*************************************"
## summary of the options:
##
## -n (number of individuals) 1000
## -L (number of loci) 12730
## -K (number of ancestral pops) 4
## -x (input file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno
## -q (individual admixture file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K4/run1/genotypes_r1.4.Q
## -g (ancestral frequencies file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K4/run1/genotypes_r1.4.G
## -i (number max of iterations) 200
## -a (regularization parameter) 10
## -s (seed random init) 140460229812031
## -e (tolerance error) 1E-05
## -p (number of processes) 1
## - diploid
##
## Read genotype file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno: OK.
##
##
## Main algorithm:
## [ ]
## [=============================]
## Number of iterations: 77
##
## Least-square error: 2104544.184406
## Write individual ancestry coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K4/run1/genotypes_r1.4.Q: OK.
## Write ancestral allele frequency coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K4/run1/genotypes_r1.4.G: OK.
##
## [1] "*************************************"
## [1] "* cross-entropy estimation *"
## [1] "*************************************"
## summary of the options:
##
## -n (number of individuals) 1000
## -L (number of loci) 12730
## -K (number of ancestral pops) 4
## -x (genotype file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.geno
## -q (individual admixture) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K4/run1/genotypes_r1.4.Q
## -g (ancestral frequencies) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K4/run1/genotypes_r1.4.G
## -i (with masked genotypes) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno
## - diploid
##
## Cross-Entropy (all data): 0.285632
## Cross-Entropy (masked data): 0.290917
## The project is saved into :
## genotypes.snmfProject
##
## To load the project, use:
## project = load.snmfProject("genotypes.snmfProject")
##
## To remove the project, use:
## remove.snmfProject("genotypes.snmfProject")
##
## [1] 1658766106
## [1] "*************************************"
## [1] "* create.dataset *"
## [1] "*************************************"
## summary of the options:
##
## -n (number of individuals) 1000
## -L (number of loci) 12730
## -s (seed random init) 1658766106
## -r (percentage of masked data) 0.05
## -x (genotype file in .geno format) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.geno
## -o (output file in .geno format) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno
##
## Write genotype file with masked data, /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno: OK.
##
## [1] "*************************************"
## [1] "* sNMF K = 1 repetition 2 *"
## [1] "*************************************"
## summary of the options:
##
## -n (number of individuals) 1000
## -L (number of loci) 12730
## -K (number of ancestral pops) 1
## -x (input file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno
## -q (individual admixture file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K1/run2/genotypes_r2.1.Q
## -g (ancestral frequencies file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K1/run2/genotypes_r2.1.G
## -i (number max of iterations) 200
## -a (regularization parameter) 10
## -s (seed random init) 140459974247194
## -e (tolerance error) 1E-05
## -p (number of processes) 1
## - diploid
##
## Read genotype file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno: OK.
##
##
## Main algorithm:
##
## Least-square error: 2184045.338543
## Write individual ancestry coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K1/run2/genotypes_r2.1.Q: OK.
## Write ancestral allele frequency coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K1/run2/genotypes_r2.1.G: OK.
##
## [1] "*************************************"
## [1] "* cross-entropy estimation *"
## [1] "*************************************"
## summary of the options:
##
## -n (number of individuals) 1000
## -L (number of loci) 12730
## -K (number of ancestral pops) 1
## -x (genotype file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.geno
## -q (individual admixture) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K1/run2/genotypes_r2.1.Q
## -g (ancestral frequencies) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K1/run2/genotypes_r2.1.G
## -i (with masked genotypes) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno
## - diploid
##
## Cross-Entropy (all data): 0.297444
## Cross-Entropy (masked data): 0.299182
## The project is saved into :
## genotypes.snmfProject
##
## To load the project, use:
## project = load.snmfProject("genotypes.snmfProject")
##
## To remove the project, use:
## remove.snmfProject("genotypes.snmfProject")
##
## [1] "*************************************"
## [1] "* sNMF K = 2 repetition 2 *"
## [1] "*************************************"
## summary of the options:
##
## -n (number of individuals) 1000
## -L (number of loci) 12730
## -K (number of ancestral pops) 2
## -x (input file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno
## -q (individual admixture file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K2/run2/genotypes_r2.2.Q
## -g (ancestral frequencies file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K2/run2/genotypes_r2.2.G
## -i (number max of iterations) 200
## -a (regularization parameter) 10
## -s (seed random init) 4636033605571625754
## -e (tolerance error) 1E-05
## -p (number of processes) 1
## - diploid
##
## Read genotype file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno: OK.
##
##
## Main algorithm:
## [ ]
## [======]
## Number of iterations: 16
##
## Least-square error: 2133438.318838
## Write individual ancestry coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K2/run2/genotypes_r2.2.Q: OK.
## Write ancestral allele frequency coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K2/run2/genotypes_r2.2.G: OK.
##
## [1] "*************************************"
## [1] "* cross-entropy estimation *"
## [1] "*************************************"
## summary of the options:
##
## -n (number of individuals) 1000
## -L (number of loci) 12730
## -K (number of ancestral pops) 2
## -x (genotype file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.geno
## -q (individual admixture) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K2/run2/genotypes_r2.2.Q
## -g (ancestral frequencies) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K2/run2/genotypes_r2.2.G
## -i (with masked genotypes) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno
## - diploid
##
## Cross-Entropy (all data): 0.290486
## Cross-Entropy (masked data): 0.293674
## The project is saved into :
## genotypes.snmfProject
##
## To load the project, use:
## project = load.snmfProject("genotypes.snmfProject")
##
## To remove the project, use:
## remove.snmfProject("genotypes.snmfProject")
##
## [1] "*************************************"
## [1] "* sNMF K = 3 repetition 2 *"
## [1] "*************************************"
## summary of the options:
##
## -n (number of individuals) 1000
## -L (number of loci) 12730
## -K (number of ancestral pops) 3
## -x (input file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno
## -q (individual admixture file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K3/run2/genotypes_r2.3.Q
## -g (ancestral frequencies file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K3/run2/genotypes_r2.3.G
## -i (number max of iterations) 200
## -a (regularization parameter) 10
## -s (seed random init) 140459974247194
## -e (tolerance error) 1E-05
## -p (number of processes) 1
## - diploid
##
## Read genotype file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno: OK.
##
##
## Main algorithm:
## [ ]
## [==============]
## Number of iterations: 38
##
## Least-square error: 2115496.525583
## Write individual ancestry coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K3/run2/genotypes_r2.3.Q: OK.
## Write ancestral allele frequency coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K3/run2/genotypes_r2.3.G: OK.
##
## [1] "*************************************"
## [1] "* cross-entropy estimation *"
## [1] "*************************************"
## summary of the options:
##
## -n (number of individuals) 1000
## -L (number of loci) 12730
## -K (number of ancestral pops) 3
## -x (genotype file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.geno
## -q (individual admixture) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K3/run2/genotypes_r2.3.Q
## -g (ancestral frequencies) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K3/run2/genotypes_r2.3.G
## -i (with masked genotypes) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno
## - diploid
##
## Cross-Entropy (all data): 0.286974
## Cross-Entropy (masked data): 0.291713
## The project is saved into :
## genotypes.snmfProject
##
## To load the project, use:
## project = load.snmfProject("genotypes.snmfProject")
##
## To remove the project, use:
## remove.snmfProject("genotypes.snmfProject")
##
## [1] "*************************************"
## [1] "* sNMF K = 4 repetition 2 *"
## [1] "*************************************"
## summary of the options:
##
## -n (number of individuals) 1000
## -L (number of loci) 12730
## -K (number of ancestral pops) 4
## -x (input file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno
## -q (individual admixture file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K4/run2/genotypes_r2.4.Q
## -g (ancestral frequencies file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K4/run2/genotypes_r2.4.G
## -i (number max of iterations) 200
## -a (regularization parameter) 10
## -s (seed random init) 140459974247194
## -e (tolerance error) 1E-05
## -p (number of processes) 1
## - diploid
##
## Read genotype file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno: OK.
##
##
## Main algorithm:
## [ ]
## [===========================]
## Number of iterations: 71
##
## Least-square error: 2104443.508871
## Write individual ancestry coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K4/run2/genotypes_r2.4.Q: OK.
## Write ancestral allele frequency coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K4/run2/genotypes_r2.4.G: OK.
##
## [1] "*************************************"
## [1] "* cross-entropy estimation *"
## [1] "*************************************"
## summary of the options:
##
## -n (number of individuals) 1000
## -L (number of loci) 12730
## -K (number of ancestral pops) 4
## -x (genotype file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.geno
## -q (individual admixture) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K4/run2/genotypes_r2.4.Q
## -g (ancestral frequencies) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K4/run2/genotypes_r2.4.G
## -i (with masked genotypes) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno
## - diploid
##
## Cross-Entropy (all data): 0.285638
## Cross-Entropy (masked data): 0.290789
## The project is saved into :
## genotypes.snmfProject
##
## To load the project, use:
## project = load.snmfProject("genotypes.snmfProject")
##
## To remove the project, use:
## remove.snmfProject("genotypes.snmfProject")
##
## [1] 445680127
## [1] "*************************************"
## [1] "* create.dataset *"
## [1] "*************************************"
## summary of the options:
##
## -n (number of individuals) 1000
## -L (number of loci) 12730
## -s (seed random init) 445680127
## -r (percentage of masked data) 0.05
## -x (genotype file in .geno format) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.geno
## -o (output file in .geno format) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno
##
## Write genotype file with masked data, /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno: OK.
##
## [1] "*************************************"
## [1] "* sNMF K = 1 repetition 3 *"
## [1] "*************************************"
## summary of the options:
##
## -n (number of individuals) 1000
## -L (number of loci) 12730
## -K (number of ancestral pops) 1
## -x (input file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno
## -q (individual admixture file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K1/run3/genotypes_r3.1.Q
## -g (ancestral frequencies file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K1/run3/genotypes_r3.1.G
## -i (number max of iterations) 200
## -a (regularization parameter) 10
## -s (seed random init) 445680127
## -e (tolerance error) 1E-05
## -p (number of processes) 1
## - diploid
##
## Read genotype file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno: OK.
##
##
## Main algorithm:
##
## Least-square error: 2184839.526636
## Write individual ancestry coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K1/run3/genotypes_r3.1.Q: OK.
## Write ancestral allele frequency coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K1/run3/genotypes_r3.1.G: OK.
##
## [1] "*************************************"
## [1] "* cross-entropy estimation *"
## [1] "*************************************"
## summary of the options:
##
## -n (number of individuals) 1000
## -L (number of loci) 12730
## -K (number of ancestral pops) 1
## -x (genotype file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.geno
## -q (individual admixture) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K1/run3/genotypes_r3.1.Q
## -g (ancestral frequencies) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K1/run3/genotypes_r3.1.G
## -i (with masked genotypes) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno
## - diploid
##
## Cross-Entropy (all data): 0.297387
## Cross-Entropy (masked data): 0.300231
## The project is saved into :
## genotypes.snmfProject
##
## To load the project, use:
## project = load.snmfProject("genotypes.snmfProject")
##
## To remove the project, use:
## remove.snmfProject("genotypes.snmfProject")
##
## [1] "*************************************"
## [1] "* sNMF K = 2 repetition 3 *"
## [1] "*************************************"
## summary of the options:
##
## -n (number of individuals) 1000
## -L (number of loci) 12730
## -K (number of ancestral pops) 2
## -x (input file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno
## -q (individual admixture file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K2/run3/genotypes_r3.2.Q
## -g (ancestral frequencies file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K2/run3/genotypes_r3.2.G
## -i (number max of iterations) 200
## -a (regularization parameter) 10
## -s (seed random init) 140458761161215
## -e (tolerance error) 1E-05
## -p (number of processes) 1
## - diploid
##
## Read genotype file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno: OK.
##
##
## Main algorithm:
## [ ]
## [======]
## Number of iterations: 16
##
## Least-square error: 2132647.984240
## Write individual ancestry coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K2/run3/genotypes_r3.2.Q: OK.
## Write ancestral allele frequency coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K2/run3/genotypes_r3.2.G: OK.
##
## [1] "*************************************"
## [1] "* cross-entropy estimation *"
## [1] "*************************************"
## summary of the options:
##
## -n (number of individuals) 1000
## -L (number of loci) 12730
## -K (number of ancestral pops) 2
## -x (genotype file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.geno
## -q (individual admixture) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K2/run3/genotypes_r3.2.Q
## -g (ancestral frequencies) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K2/run3/genotypes_r3.2.G
## -i (with masked genotypes) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno
## - diploid
##
## Cross-Entropy (all data): 0.290421
## Cross-Entropy (masked data): 0.294919
## The project is saved into :
## genotypes.snmfProject
##
## To load the project, use:
## project = load.snmfProject("genotypes.snmfProject")
##
## To remove the project, use:
## remove.snmfProject("genotypes.snmfProject")
##
## [1] "*************************************"
## [1] "* sNMF K = 3 repetition 3 *"
## [1] "*************************************"
## summary of the options:
##
## -n (number of individuals) 1000
## -L (number of loci) 12730
## -K (number of ancestral pops) 3
## -x (input file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno
## -q (individual admixture file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K3/run3/genotypes_r3.3.Q
## -g (ancestral frequencies file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K3/run3/genotypes_r3.3.G
## -i (number max of iterations) 200
## -a (regularization parameter) 10
## -s (seed random init) 4612136376258824703
## -e (tolerance error) 1E-05
## -p (number of processes) 1
## - diploid
##
## Read genotype file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno: OK.
##
##
## Main algorithm:
## [ ]
## [===================]
## Number of iterations: 50
##
## Least-square error: 2115523.051283
## Write individual ancestry coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K3/run3/genotypes_r3.3.Q: OK.
## Write ancestral allele frequency coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K3/run3/genotypes_r3.3.G: OK.
##
## [1] "*************************************"
## [1] "* cross-entropy estimation *"
## [1] "*************************************"
## summary of the options:
##
## -n (number of individuals) 1000
## -L (number of loci) 12730
## -K (number of ancestral pops) 3
## -x (genotype file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.geno
## -q (individual admixture) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K3/run3/genotypes_r3.3.Q
## -g (ancestral frequencies) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K3/run3/genotypes_r3.3.G
## -i (with masked genotypes) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno
## - diploid
##
## Cross-Entropy (all data): 0.286917
## Cross-Entropy (masked data): 0.292771
## The project is saved into :
## genotypes.snmfProject
##
## To load the project, use:
## project = load.snmfProject("genotypes.snmfProject")
##
## To remove the project, use:
## remove.snmfProject("genotypes.snmfProject")
##
## [1] "*************************************"
## [1] "* sNMF K = 4 repetition 3 *"
## [1] "*************************************"
## summary of the options:
##
## -n (number of individuals) 1000
## -L (number of loci) 12730
## -K (number of ancestral pops) 4
## -x (input file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno
## -q (individual admixture file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K4/run3/genotypes_r3.4.Q
## -g (ancestral frequencies file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K4/run3/genotypes_r3.4.G
## -i (number max of iterations) 200
## -a (regularization parameter) 10
## -s (seed random init) 140458761161215
## -e (tolerance error) 1E-05
## -p (number of processes) 1
## - diploid
##
## Read genotype file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno: OK.
##
##
## Main algorithm:
## [ ]
## [=========================]
## Number of iterations: 68
##
## Least-square error: 2104444.538962
## Write individual ancestry coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K4/run3/genotypes_r3.4.Q: OK.
## Write ancestral allele frequency coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K4/run3/genotypes_r3.4.G: OK.
##
## [1] "*************************************"
## [1] "* cross-entropy estimation *"
## [1] "*************************************"
## summary of the options:
##
## -n (number of individuals) 1000
## -L (number of loci) 12730
## -K (number of ancestral pops) 4
## -x (genotype file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.geno
## -q (individual admixture) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K4/run3/genotypes_r3.4.Q
## -g (ancestral frequencies) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K4/run3/genotypes_r3.4.G
## -i (with masked genotypes) /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno
## - diploid
##
## Cross-Entropy (all data): 0.285599
## Cross-Entropy (masked data): 0.291981
## The project is saved into :
## genotypes.snmfProject
##
## To load the project, use:
## project = load.snmfProject("genotypes.snmfProject")
##
## To remove the project, use:
## remove.snmfProject("genotypes.snmfProject")
par(mfrow=c(1,1), mar=c(3,3,3,1))
#project
# plot cross-entropy criterion of all runs of the project
plot(project, cex = 1.2, col = "blue", pch = 19)
# get the cross-entropy of all runs for K = 3
ce = cross.entropy(project, K = 2)
ce
## K = 2
## run 1 0.2943160
## run 2 0.2936740
## run 3 0.2949187
# select the run with the lowest cross-entropy for K = 2
best = which.min(ce)
# display the Q-matrix
Q.matrix <- as.matrix(Q(project, K = 2, run = best))
dim(Q.matrix)
## [1] 1000 2
cluster <- apply(Q.matrix, 1, which.max)
my.colors <- c("tomato", "lightblue", "olivedrab")#, "gold")
ord <- order(ind$envi)
dim(Q.matrix)
## [1] 1000 2
bp <- barplot(t(Q.matrix[ord,]),
border = NA,
space = 0,
col = my.colors,
xlab = "Individuals",
ylab = "Ancestry proportions",
main = "Ancestry matrix")
#axis(1, at = 1:nrow(Q.matrix), labels = bp$order, las = 3, cex.axis = .4)
# get the ancestral genotype frequency matrix, G, for the 2nd run for K = 4.
G.matrix = G(project, K = 3, run = 2)
write.geno(t(training$G[which_pruned,]), "genotypes_LD.geno")
## [1] "genotypes_LD.geno"
project_LD = snmf("genotypes_LD.geno",
K = 1:4,
entropy = TRUE,
repetitions = 3,
project = "new")
## The project is saved into :
## genotypes_LD.snmfProject
##
## To load the project, use:
## project = load.snmfProject("genotypes_LD.snmfProject")
##
## To remove the project, use:
## remove.snmfProject("genotypes_LD.snmfProject")
##
## [1] 794266710
## [1] "*************************************"
## [1] "* create.dataset *"
## [1] "*************************************"
## summary of the options:
##
## -n (number of individuals) 1000
## -L (number of loci) 7438
## -s (seed random init) 794266710
## -r (percentage of masked data) 0.05
## -x (genotype file in .geno format) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.geno
## -o (output file in .geno format) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno
##
## Write genotype file with masked data, /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno: OK.
##
## [1] "*************************************"
## [1] "* sNMF K = 1 repetition 1 *"
## [1] "*************************************"
## summary of the options:
##
## -n (number of individuals) 1000
## -L (number of loci) 7438
## -K (number of ancestral pops) 1
## -x (input file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno
## -q (individual admixture file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K1/run1/genotypes_LD_r1.1.Q
## -g (ancestral frequencies file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K1/run1/genotypes_LD_r1.1.G
## -i (number max of iterations) 200
## -a (regularization parameter) 10
## -s (seed random init) 140459109747798
## -e (tolerance error) 1E-05
## -p (number of processes) 1
## - diploid
##
## Read genotype file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno: OK.
##
##
## Main algorithm:
##
## Least-square error: 1116941.990039
## Write individual ancestry coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K1/run1/genotypes_LD_r1.1.Q: OK.
## Write ancestral allele frequency coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K1/run1/genotypes_LD_r1.1.G: OK.
##
## [1] "*************************************"
## [1] "* cross-entropy estimation *"
## [1] "*************************************"
## summary of the options:
##
## -n (number of individuals) 1000
## -L (number of loci) 7438
## -K (number of ancestral pops) 1
## -x (genotype file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.geno
## -q (individual admixture) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K1/run1/genotypes_LD_r1.1.Q
## -g (ancestral frequencies) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K1/run1/genotypes_LD_r1.1.G
## -i (with masked genotypes) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno
## - diploid
##
## Cross-Entropy (all data): 0.261933
## Cross-Entropy (masked data): 0.263142
## The project is saved into :
## genotypes_LD.snmfProject
##
## To load the project, use:
## project = load.snmfProject("genotypes_LD.snmfProject")
##
## To remove the project, use:
## remove.snmfProject("genotypes_LD.snmfProject")
##
## [1] "*************************************"
## [1] "* sNMF K = 2 repetition 1 *"
## [1] "*************************************"
## summary of the options:
##
## -n (number of individuals) 1000
## -L (number of loci) 7438
## -K (number of ancestral pops) 2
## -x (input file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno
## -q (individual admixture file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K2/run1/genotypes_LD_r1.2.Q
## -g (ancestral frequencies file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K2/run1/genotypes_LD_r1.2.G
## -i (number max of iterations) 200
## -a (regularization parameter) 10
## -s (seed random init) 4602678819966913622
## -e (tolerance error) 1E-05
## -p (number of processes) 1
## - diploid
##
## Read genotype file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno: OK.
##
##
## Main algorithm:
## [ ]
## [=======================]
## Number of iterations: 62
##
## Least-square error: 1109787.255171
## Write individual ancestry coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K2/run1/genotypes_LD_r1.2.Q: OK.
## Write ancestral allele frequency coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K2/run1/genotypes_LD_r1.2.G: OK.
##
## [1] "*************************************"
## [1] "* cross-entropy estimation *"
## [1] "*************************************"
## summary of the options:
##
## -n (number of individuals) 1000
## -L (number of loci) 7438
## -K (number of ancestral pops) 2
## -x (genotype file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.geno
## -q (individual admixture) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K2/run1/genotypes_LD_r1.2.Q
## -g (ancestral frequencies) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K2/run1/genotypes_LD_r1.2.G
## -i (with masked genotypes) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno
## - diploid
##
## Cross-Entropy (all data): 0.25924
## Cross-Entropy (masked data): 0.26154
## The project is saved into :
## genotypes_LD.snmfProject
##
## To load the project, use:
## project = load.snmfProject("genotypes_LD.snmfProject")
##
## To remove the project, use:
## remove.snmfProject("genotypes_LD.snmfProject")
##
## [1] "*************************************"
## [1] "* sNMF K = 3 repetition 1 *"
## [1] "*************************************"
## summary of the options:
##
## -n (number of individuals) 1000
## -L (number of loci) 7438
## -K (number of ancestral pops) 3
## -x (input file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno
## -q (individual admixture file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K3/run1/genotypes_LD_r1.3.Q
## -g (ancestral frequencies file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K3/run1/genotypes_LD_r1.3.G
## -i (number max of iterations) 200
## -a (regularization parameter) 10
## -s (seed random init) 140459109747798
## -e (tolerance error) 1E-05
## -p (number of processes) 1
## - diploid
##
## Read genotype file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno: OK.
##
##
## Main algorithm:
## [ ]
## [==================================================]
## Number of iterations: 133
##
## Least-square error: 1105352.995343
## Write individual ancestry coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K3/run1/genotypes_LD_r1.3.Q: OK.
## Write ancestral allele frequency coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K3/run1/genotypes_LD_r1.3.G: OK.
##
## [1] "*************************************"
## [1] "* cross-entropy estimation *"
## [1] "*************************************"
## summary of the options:
##
## -n (number of individuals) 1000
## -L (number of loci) 7438
## -K (number of ancestral pops) 3
## -x (genotype file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.geno
## -q (individual admixture) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K3/run1/genotypes_LD_r1.3.Q
## -g (ancestral frequencies) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K3/run1/genotypes_LD_r1.3.G
## -i (with masked genotypes) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno
## - diploid
##
## Cross-Entropy (all data): 0.258182
## Cross-Entropy (masked data): 0.261556
## The project is saved into :
## genotypes_LD.snmfProject
##
## To load the project, use:
## project = load.snmfProject("genotypes_LD.snmfProject")
##
## To remove the project, use:
## remove.snmfProject("genotypes_LD.snmfProject")
##
## [1] "*************************************"
## [1] "* sNMF K = 4 repetition 1 *"
## [1] "*************************************"
## summary of the options:
##
## -n (number of individuals) 1000
## -L (number of loci) 7438
## -K (number of ancestral pops) 4
## -x (input file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno
## -q (individual admixture file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K4/run1/genotypes_LD_r1.4.Q
## -g (ancestral frequencies file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K4/run1/genotypes_LD_r1.4.G
## -i (number max of iterations) 200
## -a (regularization parameter) 10
## -s (seed random init) 140459109747798
## -e (tolerance error) 1E-05
## -p (number of processes) 1
## - diploid
##
## Read genotype file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno: OK.
##
##
## Main algorithm:
## [ ]
## [=====================================================================]
## Number of iterations: 185
##
## Least-square error: 1101496.920349
## Write individual ancestry coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K4/run1/genotypes_LD_r1.4.Q: OK.
## Write ancestral allele frequency coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K4/run1/genotypes_LD_r1.4.G: OK.
##
## [1] "*************************************"
## [1] "* cross-entropy estimation *"
## [1] "*************************************"
## summary of the options:
##
## -n (number of individuals) 1000
## -L (number of loci) 7438
## -K (number of ancestral pops) 4
## -x (genotype file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.geno
## -q (individual admixture) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K4/run1/genotypes_LD_r1.4.Q
## -g (ancestral frequencies) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K4/run1/genotypes_LD_r1.4.G
## -i (with masked genotypes) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno
## - diploid
##
## Cross-Entropy (all data): 0.257073
## Cross-Entropy (masked data): 0.261336
## The project is saved into :
## genotypes_LD.snmfProject
##
## To load the project, use:
## project = load.snmfProject("genotypes_LD.snmfProject")
##
## To remove the project, use:
## remove.snmfProject("genotypes_LD.snmfProject")
##
## [1] 218275337
## [1] "*************************************"
## [1] "* create.dataset *"
## [1] "*************************************"
## summary of the options:
##
## -n (number of individuals) 1000
## -L (number of loci) 7438
## -s (seed random init) 218275337
## -r (percentage of masked data) 0.05
## -x (genotype file in .geno format) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.geno
## -o (output file in .geno format) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno
##
## Write genotype file with masked data, /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno: OK.
##
## [1] "*************************************"
## [1] "* sNMF K = 1 repetition 2 *"
## [1] "*************************************"
## summary of the options:
##
## -n (number of individuals) 1000
## -L (number of loci) 7438
## -K (number of ancestral pops) 1
## -x (input file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno
## -q (individual admixture file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K1/run2/genotypes_LD_r2.1.Q
## -g (ancestral frequencies file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K1/run2/genotypes_LD_r2.1.G
## -i (number max of iterations) 200
## -a (regularization parameter) 10
## -s (seed random init) 140458533756425
## -e (tolerance error) 1E-05
## -p (number of processes) 1
## - diploid
##
## Read genotype file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno: OK.
##
##
## Main algorithm:
##
## Least-square error: 1117093.198247
## Write individual ancestry coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K1/run2/genotypes_LD_r2.1.Q: OK.
## Write ancestral allele frequency coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K1/run2/genotypes_LD_r2.1.G: OK.
##
## [1] "*************************************"
## [1] "* cross-entropy estimation *"
## [1] "*************************************"
## summary of the options:
##
## -n (number of individuals) 1000
## -L (number of loci) 7438
## -K (number of ancestral pops) 1
## -x (genotype file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.geno
## -q (individual admixture) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K1/run2/genotypes_LD_r2.1.Q
## -g (ancestral frequencies) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K1/run2/genotypes_LD_r2.1.G
## -i (with masked genotypes) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno
## - diploid
##
## Cross-Entropy (all data): 0.261897
## Cross-Entropy (masked data): 0.263818
## The project is saved into :
## genotypes_LD.snmfProject
##
## To load the project, use:
## project = load.snmfProject("genotypes_LD.snmfProject")
##
## To remove the project, use:
## remove.snmfProject("genotypes_LD.snmfProject")
##
## [1] "*************************************"
## [1] "* sNMF K = 2 repetition 2 *"
## [1] "*************************************"
## summary of the options:
##
## -n (number of individuals) 1000
## -L (number of loci) 7438
## -K (number of ancestral pops) 2
## -x (input file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno
## -q (individual admixture file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K2/run2/genotypes_LD_r2.2.Q
## -g (ancestral frequencies file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K2/run2/genotypes_LD_r2.2.G
## -i (number max of iterations) 200
## -a (regularization parameter) 10
## -s (seed random init) 140458533756425
## -e (tolerance error) 1E-05
## -p (number of processes) 1
## - diploid
##
## Read genotype file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno: OK.
##
##
## Main algorithm:
## [ ]
## [==================]
## Number of iterations: 47
##
## Least-square error: 1110177.425147
## Write individual ancestry coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K2/run2/genotypes_LD_r2.2.Q: OK.
## Write ancestral allele frequency coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K2/run2/genotypes_LD_r2.2.G: OK.
##
## [1] "*************************************"
## [1] "* cross-entropy estimation *"
## [1] "*************************************"
## summary of the options:
##
## -n (number of individuals) 1000
## -L (number of loci) 7438
## -K (number of ancestral pops) 2
## -x (genotype file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.geno
## -q (individual admixture) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K2/run2/genotypes_LD_r2.2.Q
## -g (ancestral frequencies) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K2/run2/genotypes_LD_r2.2.G
## -i (with masked genotypes) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno
## - diploid
##
## Cross-Entropy (all data): 0.259193
## Cross-Entropy (masked data): 0.262317
## The project is saved into :
## genotypes_LD.snmfProject
##
## To load the project, use:
## project = load.snmfProject("genotypes_LD.snmfProject")
##
## To remove the project, use:
## remove.snmfProject("genotypes_LD.snmfProject")
##
## [1] "*************************************"
## [1] "* sNMF K = 3 repetition 2 *"
## [1] "*************************************"
## summary of the options:
##
## -n (number of individuals) 1000
## -L (number of loci) 7438
## -K (number of ancestral pops) 3
## -x (input file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno
## -q (individual admixture file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K3/run2/genotypes_LD_r2.3.Q
## -g (ancestral frequencies file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K3/run2/genotypes_LD_r2.3.G
## -i (number max of iterations) 200
## -a (regularization parameter) 10
## -s (seed random init) 140458533756425
## -e (tolerance error) 1E-05
## -p (number of processes) 1
## - diploid
##
## Read genotype file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno: OK.
##
##
## Main algorithm:
## [ ]
## [===========================================================================]
## Number of iterations: 200
##
## Least-square error: 1105595.059315
## Write individual ancestry coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K3/run2/genotypes_LD_r2.3.Q: OK.
## Write ancestral allele frequency coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K3/run2/genotypes_LD_r2.3.G: OK.
##
## [1] "*************************************"
## [1] "* cross-entropy estimation *"
## [1] "*************************************"
## summary of the options:
##
## -n (number of individuals) 1000
## -L (number of loci) 7438
## -K (number of ancestral pops) 3
## -x (genotype file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.geno
## -q (individual admixture) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K3/run2/genotypes_LD_r2.3.Q
## -g (ancestral frequencies) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K3/run2/genotypes_LD_r2.3.G
## -i (with masked genotypes) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno
## - diploid
##
## Cross-Entropy (all data): 0.258112
## Cross-Entropy (masked data): 0.262279
## The project is saved into :
## genotypes_LD.snmfProject
##
## To load the project, use:
## project = load.snmfProject("genotypes_LD.snmfProject")
##
## To remove the project, use:
## remove.snmfProject("genotypes_LD.snmfProject")
##
## [1] "*************************************"
## [1] "* sNMF K = 4 repetition 2 *"
## [1] "*************************************"
## summary of the options:
##
## -n (number of individuals) 1000
## -L (number of loci) 7438
## -K (number of ancestral pops) 4
## -x (input file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno
## -q (individual admixture file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K4/run2/genotypes_LD_r2.4.Q
## -g (ancestral frequencies file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K4/run2/genotypes_LD_r2.4.G
## -i (number max of iterations) 200
## -a (regularization parameter) 10
## -s (seed random init) 140458533756425
## -e (tolerance error) 1E-05
## -p (number of processes) 1
## - diploid
##
## Read genotype file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno: OK.
##
##
## Main algorithm:
## [ ]
## [=====================================]
## Number of iterations: 100
##
## Least-square error: 1101620.754549
## Write individual ancestry coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K4/run2/genotypes_LD_r2.4.Q: OK.
## Write ancestral allele frequency coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K4/run2/genotypes_LD_r2.4.G: OK.
##
## [1] "*************************************"
## [1] "* cross-entropy estimation *"
## [1] "*************************************"
## summary of the options:
##
## -n (number of individuals) 1000
## -L (number of loci) 7438
## -K (number of ancestral pops) 4
## -x (genotype file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.geno
## -q (individual admixture) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K4/run2/genotypes_LD_r2.4.Q
## -g (ancestral frequencies) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K4/run2/genotypes_LD_r2.4.G
## -i (with masked genotypes) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno
## - diploid
##
## Cross-Entropy (all data): 0.257056
## Cross-Entropy (masked data): 0.262195
## The project is saved into :
## genotypes_LD.snmfProject
##
## To load the project, use:
## project = load.snmfProject("genotypes_LD.snmfProject")
##
## To remove the project, use:
## remove.snmfProject("genotypes_LD.snmfProject")
##
## [1] 1153480913
## [1] "*************************************"
## [1] "* create.dataset *"
## [1] "*************************************"
## summary of the options:
##
## -n (number of individuals) 1000
## -L (number of loci) 7438
## -s (seed random init) 1153480913
## -r (percentage of masked data) 0.05
## -x (genotype file in .geno format) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.geno
## -o (output file in .geno format) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno
##
## Write genotype file with masked data, /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno: OK.
##
## [1] "*************************************"
## [1] "* sNMF K = 1 repetition 3 *"
## [1] "*************************************"
## summary of the options:
##
## -n (number of individuals) 1000
## -L (number of loci) 7438
## -K (number of ancestral pops) 1
## -x (input file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno
## -q (individual admixture file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K1/run3/genotypes_LD_r3.1.Q
## -g (ancestral frequencies file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K1/run3/genotypes_LD_r3.1.G
## -i (number max of iterations) 200
## -a (regularization parameter) 10
## -s (seed random init) 140459468962001
## -e (tolerance error) 1E-05
## -p (number of processes) 1
## - diploid
##
## Read genotype file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno: OK.
##
##
## Main algorithm:
##
## Least-square error: 1116617.652157
## Write individual ancestry coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K1/run3/genotypes_LD_r3.1.Q: OK.
## Write ancestral allele frequency coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K1/run3/genotypes_LD_r3.1.G: OK.
##
## [1] "*************************************"
## [1] "* cross-entropy estimation *"
## [1] "*************************************"
## summary of the options:
##
## -n (number of individuals) 1000
## -L (number of loci) 7438
## -K (number of ancestral pops) 1
## -x (genotype file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.geno
## -q (individual admixture) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K1/run3/genotypes_LD_r3.1.Q
## -g (ancestral frequencies) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K1/run3/genotypes_LD_r3.1.G
## -i (with masked genotypes) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno
## - diploid
##
## Cross-Entropy (all data): 0.261804
## Cross-Entropy (masked data): 0.265613
## The project is saved into :
## genotypes_LD.snmfProject
##
## To load the project, use:
## project = load.snmfProject("genotypes_LD.snmfProject")
##
## To remove the project, use:
## remove.snmfProject("genotypes_LD.snmfProject")
##
## [1] "*************************************"
## [1] "* sNMF K = 2 repetition 3 *"
## [1] "*************************************"
## summary of the options:
##
## -n (number of individuals) 1000
## -L (number of loci) 7438
## -K (number of ancestral pops) 2
## -x (input file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno
## -q (individual admixture file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K2/run3/genotypes_LD_r3.2.Q
## -g (ancestral frequencies file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K2/run3/genotypes_LD_r3.2.G
## -i (number max of iterations) 200
## -a (regularization parameter) 10
## -s (seed random init) 9743415505
## -e (tolerance error) 1E-05
## -p (number of processes) 1
## - diploid
##
## Read genotype file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno: OK.
##
##
## Main algorithm:
## [ ]
## [===============================]
## Number of iterations: 83
##
## Least-square error: 1109553.171174
## Write individual ancestry coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K2/run3/genotypes_LD_r3.2.Q: OK.
## Write ancestral allele frequency coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K2/run3/genotypes_LD_r3.2.G: OK.
##
## [1] "*************************************"
## [1] "* cross-entropy estimation *"
## [1] "*************************************"
## summary of the options:
##
## -n (number of individuals) 1000
## -L (number of loci) 7438
## -K (number of ancestral pops) 2
## -x (genotype file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.geno
## -q (individual admixture) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K2/run3/genotypes_LD_r3.2.Q
## -g (ancestral frequencies) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K2/run3/genotypes_LD_r3.2.G
## -i (with masked genotypes) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno
## - diploid
##
## Cross-Entropy (all data): 0.259097
## Cross-Entropy (masked data): 0.26412
## The project is saved into :
## genotypes_LD.snmfProject
##
## To load the project, use:
## project = load.snmfProject("genotypes_LD.snmfProject")
##
## To remove the project, use:
## remove.snmfProject("genotypes_LD.snmfProject")
##
## [1] "*************************************"
## [1] "* sNMF K = 3 repetition 3 *"
## [1] "*************************************"
## summary of the options:
##
## -n (number of individuals) 1000
## -L (number of loci) 7438
## -K (number of ancestral pops) 3
## -x (input file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno
## -q (individual admixture file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K3/run3/genotypes_LD_r3.3.Q
## -g (ancestral frequencies file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K3/run3/genotypes_LD_r3.3.G
## -i (number max of iterations) 200
## -a (regularization parameter) 10
## -s (seed random init) 140459468962001
## -e (tolerance error) 1E-05
## -p (number of processes) 1
## - diploid
##
## Read genotype file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno: OK.
##
##
## Main algorithm:
## [ ]
## [=======================================================================]
## Number of iterations: 190
##
## Least-square error: 1105254.842039
## Write individual ancestry coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K3/run3/genotypes_LD_r3.3.Q: OK.
## Write ancestral allele frequency coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K3/run3/genotypes_LD_r3.3.G: OK.
##
## [1] "*************************************"
## [1] "* cross-entropy estimation *"
## [1] "*************************************"
## summary of the options:
##
## -n (number of individuals) 1000
## -L (number of loci) 7438
## -K (number of ancestral pops) 3
## -x (genotype file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.geno
## -q (individual admixture) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K3/run3/genotypes_LD_r3.3.Q
## -g (ancestral frequencies) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K3/run3/genotypes_LD_r3.3.G
## -i (with masked genotypes) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno
## - diploid
##
## Cross-Entropy (all data): 0.258024
## Cross-Entropy (masked data): 0.264091
## The project is saved into :
## genotypes_LD.snmfProject
##
## To load the project, use:
## project = load.snmfProject("genotypes_LD.snmfProject")
##
## To remove the project, use:
## remove.snmfProject("genotypes_LD.snmfProject")
##
## [1] "*************************************"
## [1] "* sNMF K = 4 repetition 3 *"
## [1] "*************************************"
## summary of the options:
##
## -n (number of individuals) 1000
## -L (number of loci) 7438
## -K (number of ancestral pops) 4
## -x (input file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno
## -q (individual admixture file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K4/run3/genotypes_LD_r3.4.Q
## -g (ancestral frequencies file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K4/run3/genotypes_LD_r3.4.G
## -i (number max of iterations) 200
## -a (regularization parameter) 10
## -s (seed random init) 140459468962001
## -e (tolerance error) 1E-05
## -p (number of processes) 1
## - diploid
##
## Read genotype file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno: OK.
##
##
## Main algorithm:
## [ ]
## [===========================================================================]
## Number of iterations: 200
##
## Least-square error: 1101273.674557
## Write individual ancestry coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K4/run3/genotypes_LD_r3.4.Q: OK.
## Write ancestral allele frequency coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K4/run3/genotypes_LD_r3.4.G: OK.
##
## [1] "*************************************"
## [1] "* cross-entropy estimation *"
## [1] "*************************************"
## summary of the options:
##
## -n (number of individuals) 1000
## -L (number of loci) 7438
## -K (number of ancestral pops) 4
## -x (genotype file) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.geno
## -q (individual admixture) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K4/run3/genotypes_LD_r3.4.Q
## -g (ancestral frequencies) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K4/run3/genotypes_LD_r3.4.G
## -i (with masked genotypes) /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno
## - diploid
##
## Cross-Entropy (all data): 0.25694
## Cross-Entropy (masked data): 0.263926
## The project is saved into :
## genotypes_LD.snmfProject
##
## To load the project, use:
## project = load.snmfProject("genotypes_LD.snmfProject")
##
## To remove the project, use:
## remove.snmfProject("genotypes_LD.snmfProject")
#project
# plot cross-entropy criterion of all runs of the project
plot(project_LD, cex = 1.2, col = "blue", pch = 19)
# get the cross-entropy of all runs for K = 3
ce = cross.entropy(project_LD, K = 2)
ce
## K = 2
## run 1 0.2615403
## run 2 0.2623173
## run 3 0.2641197
# select the run with the lowest cross-entropy for K = 2
best = which.min(ce)
# display the Q-matrix
Q.matrix <- as.matrix(Q(project_LD, K = 2, run = best))
dim(Q.matrix)
## [1] 1000 2
cluster <- apply(Q.matrix, 1, which.max)
my.colors <- c("tomato", "lightblue")#, "olivedrab")#, "gold")
ord <- order(ind$envi)
dim(Q.matrix)
## [1] 1000 2
par(mfrow=c(1,1), mar=c(4,4,3,1))
bp <- barplot(t(Q.matrix[ord,]),
border = NA,
space = 0,
col = my.colors,
xlab = "Individuals (sorted by environment)",
ylab = "Ancestry proportions",
main = "Ancestry matrix")
#axis(1, at = 1:nrow(Q.matrix), labels = bp$order, las = 3, cex.axis = .4)
# get the ancestral genotype frequency matrix, G, for the 2nd run for K = 4.
G.matrix = G(project, K = 3, run = 2)
dim(training$G)
## [1] 12730 1000
FstDataFrame <- MakeDiploidFSTMat(t(training$G),training$position,ind$group)
## Calculating FSTs, may take a few minutes...
## [1] "10000 done of 12730"
head(FstDataFrame)
## LocusName He FST T1 T2 FSTNoCorr
## 1 43 0.0019980 0.003210998 3.209683e-06 0.0009995906 0.02265019
## 2 104 0.0158720 0.002094303 1.662963e-05 0.0079404119 0.02143843
## 3 160 0.0647555 0.033544517 1.087723e-03 0.0324262676 0.05356826
## 4 204 0.0009995 0.002397900 1.199032e-06 0.0005000340 0.02187832
## 5 214 0.0029955 0.025341903 3.800214e-05 0.0014995771 0.04391575
## 6 275 0.0029955 0.004571143 6.850739e-06 0.0014986929 0.02394794
## T1NoCorr T2NoCorr meanAlleleFreq
## 1 2.264259e-05 0.0009996647 0.9990
## 2 1.702425e-04 0.0079409973 0.9920
## 3 1.737151e-03 0.0324287423 0.9665
## 4 1.094072e-05 0.0005000712 0.9995
## 5 6.585972e-05 0.0014996832 0.9985
## 6 3.589326e-05 0.0014988036 0.9985
#str(FstDataFrame)
plot(FstDataFrame$He, FstDataFrame$FST)
plot(as.numeric(FstDataFrame$LocusName)[FstDataFrame$He>0.1], FstDataFrame$FST[FstDataFrame$He>0.1], ylim=c(0,0.1))
plot_layers(y_head=0.1, y_arrows=c(0.1,0.06))
k <- 39 ## Number of pops
out_ini <- OutFLANK(FstDataFrame, NumberOfSamples=k)
## Run outflank on FST dataframe
#out_ini <- OutFLANK(FstDataFrame[FstDataFrame$He>0.05,], NumberOfSamples=k)
## Run outflank without low He loci
# Plot results to compare chi-squared distribution vs. actual FST distribution
OutFLANKResultsPlotter(out_ini, withOutliers = TRUE,
NoCorr = TRUE, Hmin = 0.1, binwidth = 0.001, Zoom =
FALSE, RightZoomFraction = 0.05, titletext = NULL)
## Poor fit, particularly on right tail
OutFLANKResultsPlotter(out_ini, withOutliers = TRUE,
NoCorr = TRUE, Hmin = 0.1, binwidth = 0.001, Zoom =
TRUE, RightZoomFraction = 0.15, titletext = NULL)
# Histogram of P-values weird
hist(out_ini$results$pvaluesRightTail,breaks = 20)
#### LD Pruning ####
#### Evaluating OutFLANK with pruned data ####
plot(FstDataFrame$He[which_pruned], FstDataFrame$FST[which_pruned])
Fstdf2 <- FstDataFrame[which_pruned,]
dim(Fstdf2)
## [1] 7438 9
Fstdf3 <- Fstdf2[Fstdf2$He>0.05,]
### Trimming without He trimming
out_trim1 <- OutFLANK(Fstdf2, NumberOfSamples=k, Hmin = 0.05)
OutFLANKResultsPlotter(out_trim1, withOutliers = TRUE,
NoCorr = TRUE, Hmin = 0.1, binwidth = 0.001, Zoom =
FALSE, RightZoomFraction = 0.15, titletext = NULL)
out_trim <- OutFLANK(Fstdf3, NumberOfSamples=k)
# I do not think that OutFLANK is removing low H loci correctly
# The fit is much better if I remove these manually than if I do not
head(out_trim$results)
## LocusName He FST T1 T2 FSTNoCorr
## 3 160 0.0647555 0.033544517 0.0010877235 0.03242627 0.05356826
## 7 311 0.0994875 0.005853766 0.0002913773 0.04977604 0.02451949
## 14 566 0.4890480 0.016621863 0.0040684189 0.24476312 0.03576409
## 17 627 0.4999395 0.011441064 0.0028623924 0.25018585 0.03210544
## 20 687 0.2854875 0.022495971 0.0032147429 0.14290305 0.04028350
## 22 726 0.1563795 0.017020085 0.0013321117 0.07826704 0.03611756
## T1NoCorr T2NoCorr meanAlleleFreq indexOrder GoodH qvalues
## 3 0.001737151 0.03242874 0.9665 1 lowH 0.6983500
## 7 0.001220570 0.04977958 0.9475 2 lowH 0.9983058
## 14 0.008754369 0.24478098 0.5740 3 goodH 0.9983058
## 17 0.008032960 0.25020555 0.4945 4 goodH 0.9983058
## 20 0.005757025 0.14291274 0.8275 5 goodH 0.9983058
## 22 0.002827020 0.07827273 0.9145 6 goodH 0.9983058
## pvalues pvaluesRightTail OutlierFlag
## 3 0.02555251 0.01277625 FALSE
## 7 0.28227599 0.85886200 FALSE
## 14 0.67903024 0.33951512 FALSE
## 17 0.97532025 0.51233987 FALSE
## 20 0.35583083 0.17791541 FALSE
## 22 0.64902077 0.32451039 FALSE
plot(out_trim$results$He, out_trim$results$FST)
OutFLANKResultsPlotter(out_trim, withOutliers = TRUE,
NoCorr = TRUE, Hmin = 0.1, binwidth = 0.001, Zoom =
FALSE, RightZoomFraction = 0.15, titletext = NULL)
OutFLANKResultsPlotter(out_trim, withOutliers = TRUE,
NoCorr = TRUE, Hmin = 0.1, binwidth = 0.001, Zoom =
TRUE, RightZoomFraction = 0.10, titletext = NULL)
# Decent distribution fit, no trimming needed.
hist(out_trim$results$pvaluesRightTail,breaks = 20)
# still a conservative histogram, which is typical of outflank
outliers_crit <- out_trim$results$qvalues<0.2
(outliers_LD <- out_trim$results$LocusName[outliers_crit])
## [1] 498325 84667 95065 190575 191782 264811
# outliers identified
### Plot trimmed data only
plot(out_trim$results$LocusName,out_trim$results$FST, ylim=c(0,0.1), xaxs="i")
plot_layers(y_head=0.1, y_arrows=c(0.1,0.06))
points(out_trim$results$LocusName[outliers_crit],
out_trim$results$FST[outliers_crit], pch=19)
### Ad-hoc estimates of p-values for all data
Fstdf_adhoc <- FstDataFrame
new_dist <- FstDataFrame$FSTNoCorr*out_trim$dfInferred/out_trim$FSTNoCorrbar
hist(new_dist)
Fstdf_adhoc$p <- pchisq(new_dist, df = out_trim$dfInferred)
Fstdf_adhoc$p[Fstdf_adhoc$He<0.1] <- NA
hist(Fstdf_adhoc$p)
plot(-log10(1-Fstdf_adhoc$p), Fstdf_adhoc$FST)
Fstdf_adhoc$q <- qvalue(1-Fstdf_adhoc$p)$qvalues
plot(Fstdf_adhoc$q, Fstdf_adhoc$FST)
plot(as.numeric(Fstdf_adhoc$LocusName)[Fstdf_adhoc$He>0.1], Fstdf_adhoc$FST[Fstdf_adhoc$He>0.1], ylim=c(0,0.1), xaxs="i")
plot_layers(y_head=0.1, y_arrows=c(0.09, 0.07))
points(Fstdf_adhoc$LocusName[Fstdf_adhoc$q<0.2], Fstdf_adhoc$FST[Fstdf_adhoc$q<0.2], pch=20)
plot(x, option = "qqplot", threshold = 0.05, main="pcadapt")
#plot(x, option = "stat.distribution")
summary(x$chi2.stat)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 1.87 3.36 4225.89 6.12 156361.97 8421
# Default output from PCAdapt
par(mfrow=c(1,1))
plot(training$position, sqrt(x$chi2.stat), col="black", pch=20, main="pcadapt without LD pruning", ylim=c(0,500))
plot_layers(y_head=450, y_arrows = c(50,0))
plot(x_LD, option = "qqplot", threshold = 0.05, main="pcadapt")
#plot(x, option = "stat.distribution")
summary(x_LD$chi2.stat)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.010 1.208 2.366 3.327 4.302 59.470 5239
# Default output from PCAdapt
par(mfrow=c(1,1))
plot(training$position[which_pruned], x_LD$chi2.stat, col="black", pch=20, main="pcadapt without LD pruning", ylim=c(0,100))
plot_layers(y_head=450, y_arrows = c(100,70))
# extract scaled genotypes
scaled.genotype <- scale(as.matrix(t(training$G)))
#scaled.genotype <- as.matrix(t(sim1$G))
# extract scaled phenotypes
phen <- scale(as.matrix(ind$phenotype1))
# centering is important to remove mean
# x <- scale(as.matrix(sim1$phenotype1), center=TRUE, scale=FALSE)
# x <- as.matrix(sim1$phenotype1)
# to do mean and not SD. this might make it possible to get effect sizes
#pc <- prcomp(scaled.genotype,)
#plot(pc$sdev[1:20]^2)
#points(5,pc$sdev[5]^2, type = "h", lwd = 3, col = "blue")
# ridge regression
lfmm.ridge <- lfmm::lfmm_ridge(Y = scaled.genotype, X = phen, K = 3, lambda = 1e-4)
#The lfmm.ridge object contains estimates for the latent variables and for the effect sizes. Here, the estimates are used for computing calibrated significance values and for testing associations between the response matrix Y and the explanatory variable x. It can be done as follows:
lfmm.test <- lfmm::lfmm_test(Y = scaled.genotype, X = phen, lfmm = lfmm.ridge, calibrate = "gif")
p.values <- lfmm.test$calibrated.pvalue
lfmm.test$gif
## [1] 3.983324
hist(p.values, col = "lightgreen", main="LFMM ridge")
qval <- qvalue::qvalue(p.values)
plot(qval)
#The plot suggests that setting fdr.level = 0.025 warrant few false positives.
qval <- qvalue::qvalue(p.values, fdr.level = 0.005)
candidates <- which(qval$significant)
plot(training$position, -log10(p.values), cex = .5, pch = 19, col = "black", main="LFMM ridge", ylim=c(0, 60))
plot_layers(y_head=55, y_arrows=c(10, 0))
#### Effect sizes ridge regression step 1: run lfmm_ridge (or any lfmm model) and get the estimated latent factors from the U matrix (obj.lfmm$U). When lfmm is run with K factors and n individuals, U is an n by K matrix.
step 2: perform a standard linear regression analysis of the phenotype on the SNP frequency (in the direction opposite to LFMM) by adding U as covariate to the model. This will estimate the LFMM effect size for each SNP. The R command should look like this: lm( y ~ . , data = data.frame(genotype[,i], U))
str(lfmm.ridge)
## List of 5
## $ K : num 3
## $ lambda: num 1e-04
## $ U : num [1:1000, 1:3] 10.97 -26.98 11.8 -6.45 11.18 ...
## $ V : num [1:12730, 1:3] 0.003738 -0.003444 0.001665 0.002643 -0.000452 ...
## $ B : num [1:12730, 1] 0.00675 0.04897 -0.0848 0.00549 0.02794 ...
## - attr(*, "class")= chr "ridgeLFMM"
m2 <- which(training$position %in% (muts$position[muts$type=="m2" & muts$count==TRUE]))
dim(G)
## [1] 12736 1000
effects <- data.frame(position=training$position[m2], est_coef_ridge=NA, est_coef_PC=NA)
### Try Olivier's suggestion
for (i in 1:length(m2)){
effects$est_coef_ridge[i] <- lm(phen ~., data = data.frame(gen = training$G[m2[i],], lfmm.ridge$U))$coef[2]
}
### Use PC axes as covariates
for (i in 1:length(m2)){
effects$est_coef_PC[i] <- lm(phen ~., data = data.frame(gen = training$G[m2[i],], x_LD$scores))$coef[2]
}
effects
## position est_coef_ridge est_coef_PC
## 1 110590 0.2589593 0.07340806
(new_muts <- merge(muts,effects))
## position selCoef originGen type freq pa2 prop count
## 1 110590 0.146014 4546 m2 0.754 0.004 0.03571429 TRUE
## est_coef_ridge est_coef_PC
## 1 0.2589593 0.07340806
plot(new_muts$selCoef, new_muts$est_coef_ridge, abline(0,1), col="blue", pch=19, xlab="True effect size", ylab="Estimated effect size")
points(new_muts$selCoef, new_muts$est_coef_PC, abline(0,1), col="green", pch=15)
#LFMM parameters can alternatively be estimated by solving regularized least-squares mimimization, with lasso penalty as follows.
lfmm.lasso <- lfmm::lfmm_lasso(Y = scaled.genotype, X = phen, K = 3, nozero.prop = 0.02)
## It = 1/100, err2 = 0.999000000049747
## It = 2/100, err2 = 0.988683353488457
## It = 3/100, err2 = 0.988694883822047
## It = 4/100, err2 = 0.988699730079893
## It = 5/100, err2 = 0.988700739298245
## === lambda = 0.151284210384581, no zero B proportion = 0.0232521602513747
#The lfmm.lasso object contains new estimates for the latent variables and for the effect sizes. Note that for lasso, we didn't set the value of a regularization parameter. Instead, we set the proportion of non-null effects (here 2 percent).
lfmm.test <- lfmm::lfmm_test(Y = scaled.genotype, X = phen, lfmm = lfmm.lasso, calibrate = "gif")
p.values <- lfmm.test$calibrated.pvalue
lfmm.test$gif
## [1] 1.548855
hist(p.values, col = "lightblue")
qval <- qvalue::qvalue(p.values)
plot(qval)
qval <- qvalue::qvalue(p.values, fdr.level = 0.005)
candidates <- which(qval$significant)
plot(training$position, -log10(p.values), cex = .5, pch = 19, col = "black", main="LFMM lasso", ylim=c(0, 50), xaxs="i")
plot_layers(y_head=45, y_arrows=c(5,0))
library(rehh)
## Loading required package: rehh.data
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
### Convert vcf@gt to haplotype format .thap
# one file for each chromosome
#SNP1 SNP2 SNP3
#IND1 hap1 A T A
#IND1 hap2 A C T
#IND2 hap1 G C T
#IND2 hap2 A T A
nlociperchrom <- table(vcf@fix[,"CHROM"])
substring("0|1", 1, last=1)
## [1] "0"
substring("0|1", 3, last=3)
## [1] "1"
head(vcf_filt)
## [1] "***** Object of class 'vcfR' *****"
## [1] "***** Meta section *****"
## [1] "##fileformat=VCFv4.2"
## [1] "##fileDate=20170924"
## [1] "##source=SLiM"
## [1] "##INFO=<ID=MID,Number=1,Type=Integer,Description=\"Mutation ID in SLiM\">"
## [1] "##INFO=<ID=S,Number=1,Type=Float,Description=\"Selection Coefficient\">"
## [1] "##INFO=<ID=DOM,Number=1,Type=Float,Description=\"Dominance\">"
## [1] "First 6 rows."
## [1]
## [1] "***** Fixed section *****"
## CHROM POS ID REF ALT QUAL FILTER
## [1,] "1" "43" NA "A" "T" "1000" "PASS"
## [2,] "1" "104" NA "A" "T" "1000" "PASS"
## [3,] "1" "160" NA "A" "T" "1000" "PASS"
## [4,] "1" "204" NA "A" "T" "1000" "PASS"
## [5,] "1" "214" NA "A" "T" "1000" "PASS"
## [6,] "1" "275" NA "A" "T" "1000" "PASS"
## [1]
## [1] "***** Genotype section *****"
## FORMAT i0 i1 i2 i3 i4
## [1,] "GT" "0|0" "0|0" "0|0" "0|0" "0|0"
## [2,] "GT" "0|0" "0|0" "0|0" "0|0" "0|0"
## [3,] "GT" "0|0" "0|0" "0|0" "0|0" "0|0"
## [4,] "GT" "0|0" "0|0" "0|0" "0|0" "0|0"
## [5,] "GT" "0|0" "0|0" "0|0" "0|0" "0|0"
## [6,] "GT" "0|0" "0|0" "0|0" "0|0" "0|0"
## [1] "First 6 columns only."
## [1]
## [1] "Unique GT formats:"
## [1] "GT"
## [1]
rem2 <- which(duplicated(vcf_filt@fix[,"POS"]))
sort(vcf_filt@fix[rem2,"POS"])
## [1] "100074" "100091" "100972" "106904" "110255" "118610" "119269"
## [8] "119621" "121090" "122662" "129555" "133804" "136833" "144318"
## [15] "149174" "149498" "161735" "167332" "16845" "169909" "172360"
## [22] "176123" "184649" "185817" "188068" "189350" "189451" "193490"
## [29] "195752" "196359" "196407" "197840" "199591" "200698" "206956"
## [36] "210085" "216495" "219722" "225496" "227522" "230671" "237178"
## [43] "237700" "242152" "245561" "245615" "248539" "263579" "265707"
## [50] "27021" "284608" "290053" "291324" "294300" "296710" "297069"
## [57] "298187" "300539" "301776" "307744" "311448" "316403" "319403"
## [64] "31971" "322187" "325382" "328998" "333698" "340411" "343427"
## [71] "344190" "34624" "348053" "348665" "350232" "353102" "355416"
## [78] "356165" "357666" "358453" "359762" "362735" "364094" "372484"
## [85] "372804" "374444" "375849" "376534" "380272" "38119" "382815"
## [92] "382934" "383330" "38347" "383881" "387728" "390172" "390625"
## [99] "395176" "406774" "408853" "409483" "411912" "422281" "424722"
## [106] "424824" "425698" "428187" "43123" "434423" "436752" "440005"
## [113] "442018" "444233" "446075" "449955" "450142" "456033" "460666"
## [120] "46096" "462406" "464065" "465567" "475793" "476720" "479024"
## [127] "483296" "48421" "485875" "492607" "495206" "498223" "498470"
## [134] "51415" "51987" "54824" "57585" "60225" "62208" "64342"
## [141] "64983" "70207" "70319" "76531" "79041" "81321" "84987"
## [148] "85505" "87397" "92842" "97730" "99544" "99627" "99693"
vcf_filt2 = vcf_filt[-rem2,]
### Get into right format
for (i in 1:length(nlociperchrom)){
keep <- which((vcf_filt2@fix[,"CHROM"]==i))
head(vcf_filt2@gt[keep,1:10], 10)
hap1 <- apply(vcf_filt2@gt[keep,-1], 1, FUN=function(x) substring(x,1,1))
dim(hap1)
#head(hap1[,1:10])
hap2 <- apply(vcf_filt2@gt[keep,-1], 1, FUN=function(x) substring(x,3,3))
#head(hap2[,1:10])
hapt_out <- matrix(NA, nrow=2*1000, ncol=length(keep))
odd <- seq(1,(2*1000), by=2)
even <- odd +1
hapt_out[odd,] <- as.numeric(hap1)
rownames(hapt_out) <- rep("", nrow(hapt_out))
rownames(hapt_out)[odd] <- rownames(hap1)
rownames(hapt_out)[even] <- rownames(hap2)
hapt_out[even,] <- as.numeric(hap2)
#head(hapt_out[,1:10])
write.table(cbind(rownames(hapt_out), hapt_out+1), paste0("chrom",i,".thap"), row.names=F, col.names=F, quote = FALSE)
}
#a<- read.table("chrom1.thap")
### Also need to convert map.inp
#Each line contains five columns corresponding to:
#1. the SNP name
#2. the SNP chromosome (or scaffold) of origin
#3. the SNP position on the chromosome (or scaffold). Note that it is up to the user to choose either
#physical or genetic map positions (if available).
#4. the SNP ancestral allele (as coded in the haplotype input file)
#5. the SNP derived alleles (as coded in the haplotype input file)
map <- data.frame(name=1:nrow(vcf_filt2), chrom=as.numeric(vcf_filt2@fix[,"CHROM"]),
pos=as.numeric(vcf_filt2@fix[,"POS"]), anc=1, derived=2)
# setting anc=0 and derived = 1 thinks missing data
head(map)
## name chrom pos anc derived
## 1 1 1 43 1 2
## 2 2 1 104 1 2
## 3 3 1 160 1 2
## 4 4 1 204 1 2
## 5 5 1 214 1 2
## 6 6 1 275 1 2
which(duplicated(map$pos))
## integer(0)
write.table(map, "map.inp", row.names=F, col.names=F)
cnt=0
for(i in 1:length(nlociperchrom)){
cnt=cnt+1
tmp.hapfile=paste0("chrom",i,".thap")
tmp.hap=data2haplohh(hap_file=tmp.hapfile, map_file="map.inp", chr.name=i,haplotype.in.columns=FALSE)
tmp.scan=scan_hh(tmp.hap,threads=4)
if(cnt==1){wgscan=tmp.scan}else{wgscan=rbind(wgscan,tmp.scan)}
}
## Map file seems OK: 1294 SNPs declared for chromosome 1
## Standard rehh input file assumed
## Discard Haplotype with less than 100 % of genotyped SNPs
## No haplotype discarded
## Discard SNPs genotyped on less than 100 % of haplotypes
## No SNP discarded
## Data consists of 2000 haplotypes and 1294 SNPs
## Map file seems OK: 1314 SNPs declared for chromosome 2
## Standard rehh input file assumed
## Discard Haplotype with less than 100 % of genotyped SNPs
## No haplotype discarded
## Discard SNPs genotyped on less than 100 % of haplotypes
## No SNP discarded
## Data consists of 2000 haplotypes and 1314 SNPs
## Map file seems OK: 1253 SNPs declared for chromosome 3
## Standard rehh input file assumed
## Discard Haplotype with less than 100 % of genotyped SNPs
## No haplotype discarded
## Discard SNPs genotyped on less than 100 % of haplotypes
## No SNP discarded
## Data consists of 2000 haplotypes and 1253 SNPs
## Map file seems OK: 1250 SNPs declared for chromosome 4
## Standard rehh input file assumed
## Discard Haplotype with less than 100 % of genotyped SNPs
## No haplotype discarded
## Discard SNPs genotyped on less than 100 % of haplotypes
## No SNP discarded
## Data consists of 2000 haplotypes and 1250 SNPs
## Map file seems OK: 1313 SNPs declared for chromosome 5
## Standard rehh input file assumed
## Discard Haplotype with less than 100 % of genotyped SNPs
## No haplotype discarded
## Discard SNPs genotyped on less than 100 % of haplotypes
## No SNP discarded
## Data consists of 2000 haplotypes and 1313 SNPs
## Map file seems OK: 1147 SNPs declared for chromosome 6
## Standard rehh input file assumed
## Discard Haplotype with less than 100 % of genotyped SNPs
## No haplotype discarded
## Discard SNPs genotyped on less than 100 % of haplotypes
## No SNP discarded
## Data consists of 2000 haplotypes and 1147 SNPs
## Map file seems OK: 1275 SNPs declared for chromosome 7
## Standard rehh input file assumed
## Discard Haplotype with less than 100 % of genotyped SNPs
## No haplotype discarded
## Discard SNPs genotyped on less than 100 % of haplotypes
## No SNP discarded
## Data consists of 2000 haplotypes and 1275 SNPs
## Map file seems OK: 1227 SNPs declared for chromosome 8
## Standard rehh input file assumed
## Discard Haplotype with less than 100 % of genotyped SNPs
## No haplotype discarded
## Discard SNPs genotyped on less than 100 % of haplotypes
## No SNP discarded
## Data consists of 2000 haplotypes and 1227 SNPs
## Map file seems OK: 1250 SNPs declared for chromosome 9
## Standard rehh input file assumed
## Discard Haplotype with less than 100 % of genotyped SNPs
## No haplotype discarded
## Discard SNPs genotyped on less than 100 % of haplotypes
## No SNP discarded
## Data consists of 2000 haplotypes and 1250 SNPs
## Map file seems OK: 1253 SNPs declared for chromosome 10
## Standard rehh input file assumed
## Discard Haplotype with less than 100 % of genotyped SNPs
## No haplotype discarded
## Discard SNPs genotyped on less than 100 % of haplotypes
## No SNP discarded
## Data consists of 2000 haplotypes and 1253 SNPs
dim(wgscan)
## [1] 12576 7
ihs=ihh2ihs(wgscan,minmaf=0.05,freqbin=0.05)
head(ihs$iHS,25)
## CHR POSITION iHS -log10(p-value)
## 7 1 311 NA NA
## 14 1 566 NA NA
## 17 1 627 NA NA
## 19 1 634 NA NA
## 20 1 687 NA NA
## 22 1 726 NA NA
## 24 1 863 NA NA
## 27 1 932 NA NA
## 31 1 1120 NA NA
## 32 1 1171 NA NA
## 34 1 1216 NA NA
## 35 1 1247 NA NA
## 41 1 1468 NA NA
## 46 1 1699 NA NA
## 48 1 1811 NA NA
## 49 1 1819 NA NA
## 54 1 1946 NA NA
## 59 1 2110 NA NA
## 63 1 2228 NA NA
## 66 1 2486 NA NA
## 70 1 2675 -0.1092506 0.03952743
## 80 1 2945 NA NA
## 81 1 2962 NA NA
## 83 1 3018 NA NA
## 84 1 3024 NA NA
tail(ihs$iHS)
## CHR POSITION iHS -log10(p-value)
## 12554 10 499130 NA NA
## 12561 10 499367 NA NA
## 12567 10 499629 NA NA
## 12568 10 499739 NA NA
## 12569 10 499764 NA NA
## 12576 10 499989 NA NA
ihs$frequency.class
## Number of SNPs mean of the log(iHHA/iHHD) ratio
## 0.05 - 0.1 75 0.951238483
## 0.1 - 0.15 76 0.588511796
## 0.15 - 0.2 79 0.421991438
## 0.2 - 0.25 76 0.377900862
## 0.25 - 0.3 114 0.190045269
## 0.3 - 0.35 147 0.202779781
## 0.35 - 0.4 95 0.005106286
## 0.4 - 0.45 111 -0.069256982
## 0.45 - 0.5 114 -0.140772122
## 0.5 - 0.55 157 -0.177965029
## 0.55 - 0.6 189 -0.255648542
## 0.6 - 0.65 158 -0.279530058
## 0.65 - 0.7 248 -0.404833732
## 0.7 - 0.75 284 -0.523388652
## 0.75 - 0.8 296 -0.558672530
## 0.8 - 0.85 411 -0.743383397
## 0.85 - 0.9 579 -0.939585453
## 0.9 - 0.95 1064 -1.296392187
## sd of the log(iHHA/iHHD) ratio
## 0.05 - 0.1 0.5309461
## 0.1 - 0.15 0.3936370
## 0.15 - 0.2 0.4217720
## 0.2 - 0.25 0.4080201
## 0.25 - 0.3 0.3778977
## 0.3 - 0.35 0.3972260
## 0.35 - 0.4 0.4017517
## 0.4 - 0.45 0.3992746
## 0.45 - 0.5 0.4154989
## 0.5 - 0.55 0.3704670
## 0.55 - 0.6 0.4288702
## 0.6 - 0.65 0.3961157
## 0.65 - 0.7 0.3916281
## 0.7 - 0.75 0.3976893
## 0.75 - 0.8 0.4482664
## 0.8 - 0.85 0.4605596
## 0.85 - 0.9 0.4979967
## 0.9 - 0.95 0.5904876
#distribplot(ihs$iHS$iHS)
#ihsplot(ihs)
ihs$iHS[which(ihs$iHS[,4]>2),]
## CHR POSITION iHS -log10(p-value)
## 147 1 5169 -2.666785 2.115880
## 637 1 23403 -3.079055 2.682651
## 1416 2 54838 3.097942 2.710256
## 1949 2 75287 2.675152 2.126706
## 2024 2 77878 2.835195 2.339156
## 2262 2 86607 3.401843 3.174360
## 2267 2 86737 -3.373727 3.129843
## 2269 2 86755 -2.701493 2.160968
## 2286 2 87349 -2.977203 2.536270
## 2490 2 96094 -2.684031 2.138225
## 3114 3 119525 -2.610321 2.043557
## 3471 3 133910 2.584122 2.010428
## 6194 5 241386 -2.595876 2.025258
## 6856 6 269724 -2.733836 2.203418
## 7191 6 285093 -3.389856 3.155341
## 7949 7 315080 2.680613 2.133787
## 8089 7 320515 -2.609330 2.042298
## 8394 7 330460 2.885329 2.407818
## 9297 8 368887 -3.238545 2.920309
## 9318 8 369659 -2.885205 2.407647
## 9534 8 378729 -3.047222 2.636450
## 9547 8 379372 -2.641691 2.083581
## 9746 8 387622 -2.598707 2.028837
## 10559 9 420454 2.604928 2.036715
## 10638 9 423702 2.801077 2.293006
## 10639 9 423720 2.766193 2.246302
## 10724 9 427249 2.643252 2.085583
## 11221 9 446192 2.597399 2.027183
plot(ihs$iHS$POSITION,ihs$iHS$`-log10(p-value)`, col="grey", pch=20, main="REHH iHS")
plot_layers()
plot(ihs$iHS$POSITION,ihs$iHS$iHS, col="grey", pch=20, main="REHH iHS")
plot_layers()